Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/kernels')
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp)  11
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp  5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp)  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp)  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp  5
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp)  17
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp)  23
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp  1546
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp)  18
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp)  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp)  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp  86
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp  3668
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp  5400
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp  2427
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp  1802
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp  1810
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp  1934
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp)  61
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp  3430
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp  2195
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp)  49
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp  2072
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp  3613
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp  2434
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp  1808
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp  3335
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp)  49
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp  2072
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp  2434
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp  1808
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp  3335
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp  328
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp)  13
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp)  28
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp)  26
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp)  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp)  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp)  10
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp)  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp)  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp)  1854
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp)  1558
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp)  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp)  1360
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp)  1076
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp)  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp)  1854
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp)  1558
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp)  12
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp)  1360
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp)  1076
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp)  29
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp  1372
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp  2247
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp)  49
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp  2237
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp  89
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp  3459
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp  89
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp  1633
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp  89
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp  2001
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp  3778
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp)  48
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp  3178
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp  2118
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp  2236
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp)  50
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp  1751
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp  3459
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp)  46
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp  1602
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp  2770
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp  2137
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp  1904
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp)  46
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp  1602
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp  2137
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp  85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp  1904
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp)  9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp)  2
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp)  4
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp)  4250
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp)  8
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp)  4250
133 files changed, 66183 insertions, 46980 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
index 0f0e5a7ed4..8bf8d8442e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
@@ -30,9 +30,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, int, int);
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
//
// This describes the characteristics of a family of kernels, in terms of
// the required interleave properties and the output block size.
@@ -40,7 +40,7 @@ void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, i
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class gemm_s16_12x8 {
+class cls_a64_gemm_s16_8x12 {
public:
typedef int16_t operand_type;
typedef int32_t result_type;
@@ -62,10 +62,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_s16_asimd_12x8;
+ kern_type kernel = a64_gemm_s16_asimd_8x12;
- gemm_s16_12x8(const CPUInfo *) { }
+ cls_a64_gemm_s16_8x12(const CPUInfo *) { }
};
} // namespace arm_gemm
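
For readers unfamiliar with the "strategy" pattern described in the comment above: a strategy class only advertises the characteristics of a kernel family (element types, output block shape, transforms and a kernel function pointer), and a generic driver is written against those members rather than against any one kernel. The sketch below illustrates that pattern only; ExampleStrategy and run_block are invented names for this illustration and are not part of this patch or of arm_gemm.

#include <cstdint>

// Hypothetical stand-in for a strategy class such as cls_a64_gemm_s16_8x12:
// it only describes the kernel family, it does not implement the arithmetic.
struct ExampleStrategy {
    typedef int16_t operand_type;
    typedef int32_t result_type;
    typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);

    static unsigned int out_height() { return 8; }   // rows of C produced per kernel call
    static unsigned int out_width()  { return 12; }  // columns of C produced per kernel call

    kern_type kernel;                                // variant chosen at construction time
    explicit ExampleStrategy(kern_type k) : kernel(k) {}
};

// A driver templated on the strategy: it needs only the blocking parameters
// and the function pointer, so every kernel in the family can share this code.
template <class Strategy>
void run_block(const Strategy &strat,
               const typename Strategy::operand_type *a_panel,
               const typename Strategy::operand_type *b_panel,
               typename Strategy::result_type *c_panel,
               int ablocks, int bblocks, int K) {
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
}
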
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
index 7052f83a3d..a77938ffa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_s16_asimd_8x12(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
{
const int16_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 256acc4c65..b68a5f518a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -34,7 +34,7 @@ void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
#include "arm_gemm.hpp"
-class gemm_s8_4x4 {
+class cls_a64_gemm_s8_4x4 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -56,10 +56,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
kern_type kernel=a64_gemm_s8_4x4;
- gemm_s8_4x4(const CPUInfo *) { }
+ cls_a64_gemm_s8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 0e294bfe8d..eee817e8e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -32,11 +32,11 @@
namespace arm_gemm {
// Load the actual kernel
-void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class gemm_s8_12x8 {
+class cls_a64_gemm_s8_8x12 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,16 +58,17 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_s8_12x8;
+ kern_type kernel = a64_gemm_s8_8x12;
- gemm_s8_12x8(const CPUInfo *ci) {
+ cls_a64_gemm_s8_8x12(const CPUInfo *ci) {
auto mod = ci->get_cpu_model();
if (mod == CPUModel::A55r1) {
- kernel = a64_gemm_s8_12x8_a55r1;
+ kernel = a64_gemm_s8_8x12_a55r1;
} else if (mod == CPUModel::X1) {
- kernel = a64_gemm_s8_12x8_x1;
+ kernel = a64_gemm_s8_8x12_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
index ddd8124ec9..bb5226e093 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
index a7abaed9e0..7bf36a5900 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
// We divide K by 4 because the sdot instruction processes 4 elements at a time.
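
As an aside on the comment above: each 32-bit lane of the AArch64 sdot instruction accumulates the dot product of four signed 8-bit elements, which is why these kernels consume K in steps of four. A scalar model of one such lane is sketched below; the name is illustrative only and the code is not part of the patch.

#include <cstdint>

// Scalar model of a single 32-bit lane of the AArch64 SDOT instruction:
// four signed 8-bit products are summed into one int32 accumulator,
// hence the kernels step through K four elements at a time.
inline int32_t sdot_lane(int32_t acc, const int8_t a[4], const int8_t b[4]) {
    for (int i = 0; i < 4; i++) {
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    }
    return acc;
}
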
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
index 446fcf8707..afd2427b85 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
// We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
index b86204043c..e49ebbd84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
@@ -30,17 +30,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
-// 12x8 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class gemm_u16_12x8 {
+class cls_a64_gemm_u16_8x12 {
public:
typedef uint16_t operand_type;
typedef uint32_t result_type;
@@ -62,10 +54,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_u16_asimd_12x8;
+ kern_type kernel = a64_gemm_u16_asimd_8x12;
- gemm_u16_12x8(const CPUInfo *) { }
+ cls_a64_gemm_u16_8x12(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
index 66f0b7c0ac..98da7830f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_u16_asimd_8x12(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
{
const uint16_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 134007b74c..854b6751c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
// Kernel definition
void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
-class gemm_u8_4x4 {
+class cls_a64_gemm_u8_4x4 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -64,10 +64,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
kern_type kernel = a64_gemm_u8_4x4;
- gemm_u8_4x4(const CPUInfo *) { }
+ cls_a64_gemm_u8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index c0990ecd57..256ba2e08c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -30,11 +30,11 @@
namespace arm_gemm {
// Load the actual kernel
-void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class gemm_u8_12x8 {
+class cls_a64_gemm_u8_8x12 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -66,16 +66,17 @@ public:
// Use the standard fixed sized transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_u8_12x8;
+ kern_type kernel = a64_gemm_u8_8x12;
- gemm_u8_12x8(const CPUInfo *ci) {
+ cls_a64_gemm_u8_8x12(const CPUInfo *ci) {
auto mod = ci->get_cpu_model();
if (mod == CPUModel::A55r1) {
- kernel = a64_gemm_u8_12x8_a55r1;
+ kernel = a64_gemm_u8_8x12_a55r1;
} else if (mod == CPUModel::X1) {
- kernel = a64_gemm_u8_12x8_x1;
+ kernel = a64_gemm_u8_8x12_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
index c9a8a8229c..63869c9fd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
index 821e742f90..ff60cbc905 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
// We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
index 7fac67354f..1c1196b7a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
// We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
index b60401b70d..b53172509e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
@@ -25,32 +25,26 @@
#ifdef __aarch64__
-
+#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-class hybrid_fp32_mla_4x8
+class cls_a64_gemv_fp32_mla_32
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
+ typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
static unsigned int out_width()
{
- return 4;
+ return 32;
}
static constexpr unsigned int k_unroll()
@@ -73,14 +67,13 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 1, 32, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_fp32_mla_4x8;
+ kern_type kernel=a64_gemv_fp32_mla_32;
- hybrid_fp32_mla_4x8(const CPUInfo *)
+ cls_a64_gemv_fp32_mla_32(const CPUInfo *)
{
-
}
};
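
To make the new GEMV interface concrete: the kernel computes one output row of length N from an input row A of length K, a matrix B and an optional bias, then applies the ReLU / bounded-ReLU clamp selected by the Activation argument. The scalar reference below is a hedged sketch of that computation only; it assumes an ordinary row-major K x N layout for B, whereas the real a64_gemv_fp32_mla_32 reads B from a pre-packed panel layout matching out_width() above, and gemv_ref is an invented name.

#include <algorithm>
#include <cstddef>

// Scalar reference for the operation the GEMV kernel performs:
// out[n] = clamp(bias[n] + sum_k A[k] * B[k][n], minval, maxval).
// Assumes row-major B; the real kernel consumes a pre-packed panel layout.
void gemv_ref(const float *A, const float *B, float *out,
              size_t N, size_t K, const float *bias,
              float minval, float maxval) {
    for (size_t n = 0; n < N; n++) {
        float acc = (bias != nullptr) ? bias[n] : 0.0f;
        for (size_t k = 0; k < K; k++) {
            acc += A[k] * B[k * N + n];
        }
        out[n] = std::min(std::max(acc, minval), maxval); // activation clamp
    }
}

When no activation is requested, the implementation that follows leaves minval/maxval at minus/plus infinity, so the clamp is effectively a no-op.
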
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
new file mode 100644
index 0000000000..a2af8d6d14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_gemv_fp32_mla_32 (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "add x22, %x[N], #0x3\n"
+ "mov x21, %x[bias]\n"
+ "lsr x22, x22, #0x2\n"
+ "1:" // Column loop
+ "cmp x22, #0x8\n"
+ "bge 85f\n"
+ "cmp x22, #0x6\n"
+ "bgt 73f\n"
+ "beq 61f\n"
+ "cmp x22, #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp x22, #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 2f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "b 3f\n"
+ "2:" // Width 1: no bias
+ "movi v24.16b, #0x0\n"
+ "3:" // Width 1: setup done
+ "cmp x20, #0x4\n"
+ "blt 6f\n"
+ "cmp x20, #0x8\n"
+ "blt 5f\n"
+ "4:" // Width 1: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v4.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 4b\n"
+ "5:" // Width 1: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "6:" // Width 1: Multiply loop: Main loop skip
+ "cbz x20, 8f\n"
+ "7:" // Width 1: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 7b\n"
+ "8:" // Width 1: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 9f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "9:" // Width 1: No activation
+ "cmp %x[N], #0x4\n"
+ "blt 10f\n"
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 12f\n"
+ "10:" // Width 1: Partial writeback
+ "tbz %x[N], #1, 11f\n"
+ "str d24, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 12f\n"
+ "st1 { v24.s }[2], [%x[output_ptr]]\n"
+ "b 12f\n"
+ "11:" // Width 1: Partial direct writeback: partial_1_0
+ "str s24, [%x[output_ptr], #0x0]\n"
+ "12:" // Width 1: Writeback done
+ "b 97f\n"
+ "13:" // Width 2
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 14f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "b 15f\n"
+ "14:" // Width 2: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "15:" // Width 2: setup done
+ "cmp x20, #0x4\n"
+ "blt 18f\n"
+ "cmp x20, #0x8\n"
+ "blt 17f\n"
+ "16:" // Width 2: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v4.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q6, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v6.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 16b\n"
+ "17:" // Width 2: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v10.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v11.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v12.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v14.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q15, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q16, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v16.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "18:" // Width 2: Multiply loop: Main loop skip
+ "cbz x20, 20f\n"
+ "19:" // Width 2: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v17.4s, v0.s[0]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v18.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 19b\n"
+ "20:" // Width 2: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 21f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "21:" // Width 2: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "cmp %x[N], #0x8\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "blt 22f\n"
+ "str q25, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 24f\n"
+ "22:" // Width 2: Partial writeback
+ "tbz %x[N], #1, 23f\n"
+ "str d25, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 24f\n"
+ "st1 { v25.s }[2], [%x[output_ptr]]\n"
+ "b 24f\n"
+ "23:" // Width 2: Partial direct writeback: partial_1_4
+ "tbz %x[N], #0, 24f\n"
+ "str s25, [%x[output_ptr], #0x0]\n"
+ "24:" // Width 2: Writeback done
+ "b 97f\n"
+ "25:" // Width 3
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 26f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "add x21, x21, #0x30\n"
+ "b 27f\n"
+ "26:" // Width 3: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "27:" // Width 3: setup done
+ "cmp x20, #0x4\n"
+ "blt 30f\n"
+ "cmp x20, #0x8\n"
+ "blt 29f\n"
+ "28:" // Width 3: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v4.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v5.4s, v0.s[1]\n"
+ "ldr q6, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v8.4s, v0.s[2]\n"
+ "ldr q9, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v9.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v11.4s, v0.s[3]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v12.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 28b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v14.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v15.4s, v0.s[0]\n"
+ "ldr q16, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q17, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v17.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v18.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v19.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v20.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v21.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q22, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v22.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q23, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v23.4s, v0.s[3]\n"
+ "ldr q1, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v1.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "30:" // Width 3: Multiply loop: Main loop skip
+ "cbz x20, 32f\n"
+ "31:" // Width 3: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[0]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v4.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 31b\n"
+ "32:" // Width 3: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 33f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "33:" // Width 3: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "cmp %x[N], #0xc\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "blt 34f\n"
+ "str q26, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 36f\n"
+ "34:" // Width 3: Partial writeback
+ "tbz %x[N], #1, 35f\n"
+ "str d26, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 36f\n"
+ "st1 { v26.s }[2], [%x[output_ptr]]\n"
+ "b 36f\n"
+ "35:" // Width 3: Partial direct writeback: partial_1_8
+ "tbz %x[N], #0, 36f\n"
+ "str s26, [%x[output_ptr], #0x0]\n"
+ "36:" // Width 3: Writeback done
+ "b 97f\n"
+ "37:" // Width 4
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 38f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "b 39f\n"
+ "38:" // Width 4: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "39:" // Width 4: setup done
+ "cmp x20, #0x4\n"
+ "blt 42f\n"
+ "cmp x20, #0x8\n"
+ "blt 41f\n"
+ "40:" // Width 4: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[1]\n"
+ "ldr q6, [%x[B_ptr], #0x10]\n"
+ "ldr q7, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v7.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "ldr q11, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v10.4s, v0.s[2]\n"
+ "ldr q12, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v11.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v12.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[3]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v14.4s, v0.s[3]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v15.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x8\n"
+ "bge 40b\n"
+ "41:" // Width 4: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v17.4s, v0.s[0]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "ldr q19, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v18.4s, v0.s[0]\n"
+ "ldr q20, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v19.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q21, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v21.4s, v0.s[1]\n"
+ "ldr q22, [%x[B_ptr], #0x10]\n"
+ "ldr q23, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v22.4s, v0.s[1]\n"
+ "ldr q1, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v23.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v1.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[2]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v4.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v5.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[3]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v7.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v8.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v9.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "42:" // Width 4: Multiply loop: Main loop skip
+ "cbz x20, 44f\n"
+ "43:" // Width 4: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[0]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v11.4s, v0.s[0]\n"
+ "ldr q13, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v12.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v27.4s, v13.4s, v0.s[0]\n"
+ "cbnz x20, 43b\n"
+ "44:" // Width 4: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 45f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "45:" // Width 4: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "cmp %x[N], #0x10\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "blt 46f\n"
+ "str q27, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 48f\n"
+ "46:" // Width 4: Partial writeback
+ "tbz %x[N], #1, 47f\n"
+ "str d27, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 48f\n"
+ "st1 { v27.s }[2], [%x[output_ptr]]\n"
+ "b 48f\n"
+ "47:" // Width 4: Partial direct writeback: partial_1_12
+ "tbz %x[N], #0, 48f\n"
+ "str s27, [%x[output_ptr], #0x0]\n"
+ "48:" // Width 4: Writeback done
+ "b 97f\n"
+ "49:" // Width 5
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 50f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "add x21, x21, #0x50\n"
+ "b 51f\n"
+ "50:" // Width 5: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "51:" // Width 5: setup done
+ "cmp x20, #0x4\n"
+ "blt 54f\n"
+ "cmp x20, #0x8\n"
+ "blt 53f\n"
+ "52:" // Width 5: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v7.4s, v0.s[1]\n"
+ "ldr q10, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v8.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v9.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v11.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v12.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v13.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v14.4s, v0.s[2]\n"
+ "ldr q16, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v15.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q17, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v16.4s, v0.s[3]\n"
+ "ldr q18, [%x[B_ptr], #0x20]\n"
+ "ldr q19, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v17.4s, v0.s[3]\n"
+ "ldr q20, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v18.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v19.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v20.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 52b\n"
+ "53:" // Width 5: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q21, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v21.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x10]\n"
+ "ldr q23, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v22.4s, v0.s[0]\n"
+ "ldr q1, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v23.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v1.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v2.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v3.4s, v0.s[1]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v4.4s, v0.s[1]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v5.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v7.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v8.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v11.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v12.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v13.4s, v0.s[3]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v14.4s, v0.s[3]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v15.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "54:" // Width 5: Multiply loop: Main loop skip
+ "cbz x20, 56f\n"
+ "55:" // Width 5: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q18, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v18.4s, v0.s[0]\n"
+ "ldr q19, [%x[B_ptr], #0x10]\n"
+ "ldr q20, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v19.4s, v0.s[0]\n"
+ "ldr q21, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v20.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v21.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v28.4s, v22.4s, v0.s[0]\n"
+ "cbnz x20, 55b\n"
+ "56:" // Width 5: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 57f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "57:" // Width 5: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "cmp %x[N], #0x14\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "blt 58f\n"
+ "str q28, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 60f\n"
+ "58:" // Width 5: Partial writeback
+ "tbz %x[N], #1, 59f\n"
+ "str d28, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 60f\n"
+ "st1 { v28.s }[2], [%x[output_ptr]]\n"
+ "b 60f\n"
+ "59:" // Width 5: Partial direct writeback: partial_1_16
+ "tbz %x[N], #0, 60f\n"
+ "str s28, [%x[output_ptr], #0x0]\n"
+ "60:" // Width 5: Writeback done
+ "b 97f\n"
+ "61:" // Width 6
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 62f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "add x21, x21, #0x60\n"
+ "b 63f\n"
+ "62:" // Width 6: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "63:" // Width 6: setup done
+ "cmp x20, #0x4\n"
+ "blt 66f\n"
+ "cmp x20, #0x8\n"
+ "blt 65f\n"
+ "64:" // Width 6: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v7.4s, v0.s[1]\n"
+ "ldr q9, [%x[B_ptr], #0x20]\n"
+ "ldr q10, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v8.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v10.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v11.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v12.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v13.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v14.4s, v0.s[2]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "ldr q18, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[2]\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v17.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v18.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "ldr q22, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v19.4s, v0.s[3]\n"
+ "ldr q23, [%x[B_ptr], #0x40]\n"
+ "ldr q1, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v20.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v21.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v27.4s, v22.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v28.4s, v23.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "fmla v29.4s, v1.4s, v0.s[3]\n"
+ "bge 64b\n"
+ "65:" // Width 6: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[0]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [%x[B_ptr], #0x40]\n"
+ "ldr q7, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v5.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v6.4s, v0.s[0]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v8.4s, v0.s[1]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v10.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v11.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v12.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q15, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "ldr q16, [%x[B_ptr], #0x20]\n"
+ "ldr q17, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v15.4s, v0.s[2]\n"
+ "ldr q18, [%x[B_ptr], #0x40]\n"
+ "ldr q19, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v16.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v17.4s, v0.s[2]\n"
+ "ldr q20, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v18.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q21, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v19.4s, v0.s[2]\n"
+ "ldr q22, [%x[B_ptr], #0x20]\n"
+ "ldr q23, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v20.4s, v0.s[3]\n"
+ "ldr q1, [%x[B_ptr], #0x40]\n"
+ "ldr q2, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v21.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v22.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v27.4s, v23.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v28.4s, v1.4s, v0.s[3]\n"
+ "fmla v29.4s, v2.4s, v0.s[3]\n"
+ "66:" // Width 6: Multiply loop: Main loop skip
+ "cbz x20, 68f\n"
+ "67:" // Width 6: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "ldr q8, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v6.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v28.4s, v7.4s, v0.s[0]\n"
+ "fmla v29.4s, v8.4s, v0.s[0]\n"
+ "cbnz x20, 67b\n"
+ "68:" // Width 6: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "69:" // Width 6: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "cmp %x[N], #0x18\n"
+ "add %x[output_ptr], %x[output_ptr], #0x50\n"
+ "blt 70f\n"
+ "str q29, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 72f\n"
+ "70:" // Width 6: Partial writeback
+ "tbz %x[N], #1, 71f\n"
+ "str d29, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 72f\n"
+ "st1 { v29.s }[2], [%x[output_ptr]]\n"
+ "b 72f\n"
+ "71:" // Width 6: Partial direct writeback: partial_1_20
+ "tbz %x[N], #0, 72f\n"
+ "str s29, [%x[output_ptr], #0x0]\n"
+ "72:" // Width 6: Writeback done
+ "b 97f\n"
+ "73:" // Width 7
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 74f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "ldr q30, [x21, #0x60]\n"
+ "add x21, x21, #0x70\n"
+ "b 75f\n"
+ "74:" // Width 7: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "75:" // Width 7: setup done
+ "cmp x20, #0x4\n"
+ "blt 78f\n"
+ "cmp x20, #0x8\n"
+ "blt 77f\n"
+ "76:" // Width 7: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v7.4s, v0.s[0]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v8.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v9.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v10.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v11.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v12.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q15, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v13.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x10]\n"
+ "ldr q17, [%x[B_ptr], #0x20]\n"
+ "fmla v30.4s, v14.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v15.4s, v0.s[2]\n"
+ "ldr q19, [%x[B_ptr], #0x40]\n"
+ "ldr q20, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v16.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v17.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v18.4s, v0.s[2]\n"
+ "ldr q22, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v19.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q23, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v20.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x20]\n"
+ "ldr q2, [%x[B_ptr], #0x30]\n"
+ "fmla v30.4s, v21.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v22.4s, v0.s[3]\n"
+ "ldr q4, [%x[B_ptr], #0x50]\n"
+ "ldr q5, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v23.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v1.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v2.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v3.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v29.4s, v4.4s, v0.s[3]\n"
+ "cmp x20, #0x8\n"
+ "fmla v30.4s, v5.4s, v0.s[3]\n"
+ "bge 76b\n"
+ "77:" // Width 7: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v7.4s, v0.s[0]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v8.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x40]\n"
+ "ldr q11, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v9.4s, v0.s[0]\n"
+ "ldr q12, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v11.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "ldr q19, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v16.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v17.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v18.4s, v0.s[1]\n"
+ "ldr q21, [%x[B_ptr], #0x10]\n"
+ "ldr q22, [%x[B_ptr], #0x20]\n"
+ "fmla v30.4s, v19.4s, v0.s[1]\n"
+ "ldr q23, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v20.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x40]\n"
+ "ldr q2, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v21.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v22.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v23.4s, v0.s[2]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v1.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v2.4s, v0.s[2]\n"
+ "ldr q6, [%x[B_ptr], #0x20]\n"
+ "ldr q7, [%x[B_ptr], #0x30]\n"
+ "fmla v30.4s, v3.4s, v0.s[2]\n"
+ "ldr q8, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v4.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x50]\n"
+ "ldr q10, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v5.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v6.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v29.4s, v9.4s, v0.s[3]\n"
+ "fmla v30.4s, v10.4s, v0.s[3]\n"
+ "78:" // Width 7: Multiply loop: Main loop skip
+ "cbz x20, 80f\n"
+ "79:" // Width 7: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v11.4s, v0.s[0]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v13.4s, v0.s[0]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "ldr q16, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v14.4s, v0.s[0]\n"
+ "ldr q17, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v15.4s, v0.s[0]\n"
+ "fmla v29.4s, v16.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v30.4s, v17.4s, v0.s[0]\n"
+ "cbnz x20, 79b\n"
+ "80:" // Width 7: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 81f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "81:" // Width 7: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "str q29, [%x[output_ptr], #0x50]\n"
+ "cmp %x[N], #0x1c\n"
+ "add %x[output_ptr], %x[output_ptr], #0x60\n"
+ "blt 82f\n"
+ "str q30, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 84f\n"
+ "82:" // Width 7: Partial writeback
+ "tbz %x[N], #1, 83f\n"
+ "str d30, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 84f\n"
+ "st1 { v30.s }[2], [%x[output_ptr]]\n"
+ "b 84f\n"
+ "83:" // Width 7: Partial direct writeback: partial_1_24
+ "tbz %x[N], #0, 84f\n"
+ "str s30, [%x[output_ptr], #0x0]\n"
+ "84:" // Width 7: Writeback done
+ "b 97f\n"
+ "85:" // Width 8
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 86f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "ldr q30, [x21, #0x60]\n"
+ "ldr q31, [x21, #0x70]\n"
+ "add x21, x21, #0x80\n"
+ "b 87f\n"
+ "86:" // Width 8: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "87:" // Width 8: setup done
+ "cmp x20, #0x4\n"
+ "blt 90f\n"
+ "cmp x20, #0x8\n"
+ "blt 89f\n"
+ "88:" // Width 8: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x60]\n"
+ "ldr q8, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v30.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v31.4s, v8.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "ldr q11, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x30]\n"
+ "ldr q13, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v10.4s, v0.s[1]\n"
+ "fmla v26.4s, v11.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x50]\n"
+ "ldr q15, [%x[B_ptr], #0x60]\n"
+ "fmla v27.4s, v12.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v13.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v14.4s, v0.s[1]\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v15.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "fmla v31.4s, v16.4s, v0.s[1]\n"
+ "ldr q19, [%x[B_ptr], #0x20]\n"
+ "ldr q20, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v17.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x40]\n"
+ "ldr q22, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v18.4s, v0.s[2]\n"
+ "ldr q23, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v19.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x70]\n"
+ "fmla v27.4s, v20.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v21.4s, v0.s[2]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v22.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "fmla v30.4s, v23.4s, v0.s[2]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v31.4s, v1.4s, v0.s[2]\n"
+ "ldr q6, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v2.4s, v0.s[3]\n"
+ "ldr q7, [%x[B_ptr], #0x50]\n"
+ "ldr q8, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v3.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x70]\n"
+ "fmla v26.4s, v4.4s, v0.s[3]\n"
+ "fmla v27.4s, v5.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v6.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v29.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v30.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "fmla v31.4s, v9.4s, v0.s[3]\n"
+ "bge 88b\n"
+ "89:" // Width 8: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[0]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v11.4s, v0.s[0]\n"
+ "ldr q13, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x40]\n"
+ "ldr q15, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v13.4s, v0.s[0]\n"
+ "ldr q16, [%x[B_ptr], #0x60]\n"
+ "ldr q17, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v14.4s, v0.s[0]\n"
+ "fmla v29.4s, v15.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v30.4s, v16.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q18, [%x[B_ptr], #0x0]\n"
+ "fmla v31.4s, v17.4s, v0.s[0]\n"
+ "ldr q19, [%x[B_ptr], #0x10]\n"
+ "ldr q20, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "ldr q21, [%x[B_ptr], #0x30]\n"
+ "ldr q22, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v19.4s, v0.s[1]\n"
+ "fmla v26.4s, v20.4s, v0.s[1]\n"
+ "ldr q23, [%x[B_ptr], #0x50]\n"
+ "ldr q1, [%x[B_ptr], #0x60]\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "ldr q2, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v22.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v23.4s, v0.s[1]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v1.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v31.4s, v2.4s, v0.s[1]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v3.4s, v0.s[2]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "ldr q8, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v4.4s, v0.s[2]\n"
+ "ldr q9, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v5.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x70]\n"
+ "fmla v27.4s, v6.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v7.4s, v0.s[2]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v30.4s, v9.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v31.4s, v10.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q16, [%x[B_ptr], #0x50]\n"
+ "ldr q17, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v12.4s, v0.s[3]\n"
+ "ldr q18, [%x[B_ptr], #0x70]\n"
+ "fmla v26.4s, v13.4s, v0.s[3]\n"
+ "fmla v27.4s, v14.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v15.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v29.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla v30.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v31.4s, v18.4s, v0.s[3]\n"
+ "90:" // Width 8: Multiply loop: Main loop skip
+ "cbz x20, 92f\n"
+ "91:" // Width 8: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v19.4s, v0.s[0]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v20.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v21.4s, v0.s[0]\n"
+ "ldr q23, [%x[B_ptr], #0x40]\n"
+ "ldr q1, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v22.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x60]\n"
+ "ldr q3, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v23.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v30.4s, v2.4s, v0.s[0]\n"
+ "fmla v31.4s, v3.4s, v0.s[0]\n"
+ "cbnz x20, 91b\n"
+ "92:" // Width 8: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 93f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v17.4s\n"
+ "93:" // Width 8: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "str q29, [%x[output_ptr], #0x50]\n"
+ "str q30, [%x[output_ptr], #0x60]\n"
+ "cmp %x[N], #0x20\n"
+ "add %x[output_ptr], %x[output_ptr], #0x70\n"
+ "blt 94f\n"
+ "str q31, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 96f\n"
+ "94:" // Width 8: Partial writeback
+ "tbz %x[N], #1, 95f\n"
+ "str d31, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 96f\n"
+ "st1 { v31.s }[2], [%x[output_ptr]]\n"
+ "b 96f\n"
+ "95:" // Width 8: Partial direct writeback: partial_1_28
+ "tbz %x[N], #0, 96f\n"
+ "str s31, [%x[output_ptr], #0x0]\n"
+ "96:" // Width 8: Writeback done
+ "subs x22, x22, #0x8\n"
+ "sub %x[N], %x[N], #0x20\n"
+ "bgt 1b\n"
+ "97:" // Exit
+
+ : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+ : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif
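For orientation: the assembly ending above belongs to the new fp32 GEMV kernel, which produces its single output row in strips of up to 32 columns (the "Width 1" to "Width 8" cases, four more columns per width), with an optional bias and an optional min/max clamp. The scalar sketch below is illustrative only and not part of the patch; it assumes B has been pre-packed so that each 32-column strip stores its K rows contiguously (32 floats per row, matching the 0x80-byte stride in the assembly), the clamp is shown unconditionally even though the assembly guards it on a flag bit, and all names are hypothetical.

    #include <algorithm>
    #include <cstddef>

    // Illustrative scalar reference for the packed fp32 GEMV above (sketch only).
    static void gemv_fp32_ref(const float *A, const float *B_packed, float *out,
                              const float *bias, size_t N, size_t K,
                              float minval, float maxval) {
        for (size_t col0 = 0; col0 < N; col0 += 32) {
            size_t width = (N - col0 < 32) ? (N - col0) : 32;
            for (size_t j = 0; j < width; j++) {
                float acc = bias ? bias[col0 + j] : 0.0f;
                for (size_t k = 0; k < K; k++) {
                    acc += A[k] * B_packed[k * 32 + j];  // packed 32-float row of this strip
                }
                out[col0 + j] = std::min(std::max(acc, minval), maxval);  // clamp (activation)
            }
            B_packed += K * 32;  // advance to the next packed strip of 32 columns
        }
    }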
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index 79cae6002a..73fb5b7122 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -30,15 +30,15 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-// 24x8 HGEMM "strategy" class. Describes the kernel properties.
+// 8x24 HGEMM "strategy" class. Describes the kernel properties.
//
// The generic "gemm_opt" function will instantiate one of these (allowing
// the constructor to pick a kernel implementation).
-class hgemm_24x8 {
+class cls_a64_hgemm_8x24 {
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
@@ -62,15 +62,15 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
// Default to the generic kernel
- kern_type kernel = a64_hgemm_asimd_24x8;
+ kern_type kernel = a64_hgemm_asimd_8x24;
- hgemm_24x8(const CPUInfo *ci) {
+ cls_a64_hgemm_8x24(const CPUInfo *ci) {
auto model = ci->get_cpu_model();
if (model == CPUModel::A55r1) {
- kernel = a64_hgemm_asimd_24x8_a55r1;
+ kernel = a64_hgemm_asimd_8x24_a55r1;
} else if (model == CPUModel::X1) {
- kernel = a64_hgemm_asimd_24x8_x1;
+ kernel = a64_hgemm_asimd_8x24_x1;
}
}
};
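The renamed strategy keeps the existing shape: a dispatcher constructs it with a CPUInfo and calls the selected implementation through its `kernel` member, as the constructor above shows. A minimal, hypothetical usage sketch (everything other than cls_a64_hgemm_8x24 and its members is a placeholder, not part of this patch):

    // Hypothetical caller: the constructor picks the A55r1, X1 or generic variant.
    cls_a64_hgemm_8x24 strat(ci);  // ci: const CPUInfo *
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);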
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 829ae30001..29cdd33893 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -41,7 +41,7 @@
namespace arm_gemm {
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index 657fade944..c9c48dd1c0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -34,14 +34,14 @@
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
namespace arm_gemm {
-void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
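The comment block in this file describes the packed-panel layout the kernel consumes: A in 8xK blocks, B in 24xK blocks, and C written as 8x24 chunks in row-major chunk order. As a reading aid, here is a scalar sketch of one output chunk under that layout; it assumes the usual arm_gemm interleaving (for each k step, 8 consecutive A values and 24 consecutive B values) and is not part of the patch.

    // Illustrative scalar reference for one 8x24 output chunk from packed panels
    // (assumed interleave: k-major, 8 A values then 24 B values per k step).
    static void hgemm_8x24_block_ref(const __fp16 *a_block, const __fp16 *b_block,
                                     __fp16 *c_block, int K) {
        __fp16 acc[8][24] = {};
        for (int k = 0; k < K; k++) {
            for (int i = 0; i < 8; i++) {
                for (int j = 0; j < 24; j++) {
                    acc[i][j] += a_block[k * 8 + i] * b_block[k * 24 + j];  // fp16 accumulate
                }
            }
        }
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 24; j++) {
                c_block[i * 24 + j] = acc[i][j];  // 8x24 chunk, row major
            }
        }
    }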
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index 3bb8334126..a6d2405e7e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -34,14 +34,14 @@
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
namespace arm_gemm {
-void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
new file mode 100644
index 0000000000..a76c9949de
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_bf16fp32_dot_6x16
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
+
+ cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
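k_unroll() is 2 because each BFDOT step consumes a pair of bf16 values per fp32 accumulator lane: the generic.cpp that follows issues `.inst`-encoded bfdot operations which, per 32-bit lane, add the sum of two adjacent bf16 products to the accumulator. The sketch below is my reading of that per-lane behaviour (ignoring the instruction's rounding and denormal details) and is not part of the patch.

    #include <cstdint>
    #include <cstring>

    // bf16 modelled as the top 16 bits of an IEEE-754 binary32 value.
    static inline float bf16_to_f32(uint16_t h) {
        uint32_t bits = (uint32_t)h << 16;
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One BFDOT accumulator lane: acc += a0*b0 + a1*b1 (hence k_unroll() == 2).
    static inline float bfdot_lane(float acc, uint16_t a0, uint16_t a1,
                                   uint16_t b0, uint16_t b1) {
        return acc + bf16_to_f32(a0) * bf16_to_f32(b0)
                   + bf16_to_f32(a1) * bf16_to_f32(b1);
    }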
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..be680ed645
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -0,0 +1,3668 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
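+ // Flag bits consumed by the assembly below (set above in this function):
+ //   bit 0 (0x1): accumulate into the existing C values
+ //   bit 1 (0x2): apply the min/max clamp (ReLU / bounded ReLU)
+ //   bit 2 (0x4): output rows are supplied indirectly (pointer array)
+ //   bit 3 (0x8): input rows are supplied indirectly (pointer array)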
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 15f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x16, #0x10\n"
+ "bge 13f\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 7f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 10f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 11f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 15f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "blt 21f\n"
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 26f\n"
+ "cmp x11, #0x2\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "cmp x11, #0x2\n"
+ "bge 22b\n"
+ "cbz x11, 26f\n"
+ "23:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 24f\n"
+ "ldr s0, [x10], #0x4\n"
+ "tbz x11, #0, 25f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "b 25f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 16b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x16, #0x10\n"
+ "bge 36f\n"
+ "tbz x16, #3, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 29f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 28f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 35f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 30f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 35f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 33f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 32f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 35f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 34f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 3b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 39f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 40f\n"
+ "39:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "40:" // Height 2: Column loop
+ "cbz x14, 41f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 52f\n"
+ "41:" // Height 2: no bias
+ "tbz %x[flags], #0, 51f\n"
+ "cmp x16, #0x10\n"
+ "bge 50f\n"
+ "tbz x16, #3, 45f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 43f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 42f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 49f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 49f\n"
+ "43:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 44f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 49f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 49f\n"
+ "45:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 47f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 46f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 49f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 49f\n"
+ "47:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 48f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 49f\n"
+ "48:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "49:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 52f\n"
+ "50:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 52f\n"
+ "51:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "52:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "53:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 55f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 55f\n"
+ "54:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "55:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "blt 58f\n"
+ "cmp x11, #0x10\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "bge 56b\n"
+ "57:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "58:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 63f\n"
+ "cmp x11, #0x2\n"
+ "blt 60f\n"
+ "59:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "bge 59b\n"
+ "cbz x11, 63f\n"
+ "60:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 61f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "tbz x11, #0, 62f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "b 62f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "63:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 53b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "64:" // Height 2: No activation
+ "cmp x16, #0x10\n"
+ "bge 73f\n"
+ "tbz x16, #3, 68f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 66f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 65f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 72f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 67f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 72f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 70f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 69f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 72f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 71f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "74:" // Height 2: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 40b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 76f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 77f\n"
+ "76:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "77:" // Height 3: Column loop
+ "cbz x14, 78f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 89f\n"
+ "78:" // Height 3: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "cmp x16, #0x10\n"
+ "bge 87f\n"
+ "tbz x16, #3, 82f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 80f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 79f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 86f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 86f\n"
+ "80:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 81f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 86f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 86f\n"
+ "82:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 84f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 83f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 86f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 86f\n"
+ "84:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 85f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 86f\n"
+ "85:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "86:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 89f\n"
+ "87:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 89f\n"
+ "88:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "89:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "90:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 92f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 92f\n"
+ "91:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "92:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "blt 95f\n"
+ "cmp x11, #0x10\n"
+ "blt 94f\n"
+ "93:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "bge 93b\n"
+ "94:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "95:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 100f\n"
+ "cmp x11, #0x2\n"
+ "blt 97f\n"
+ "96:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "bge 96b\n"
+ "cbz x11, 100f\n"
+ "97:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 98f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "tbz x11, #0, 99f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "b 99f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "100:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 90b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 101f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "101:" // Height 3: No activation
+ "cmp x16, #0x10\n"
+ "bge 110f\n"
+ "tbz x16, #3, 105f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 103f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 102f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 109f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 104f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 109f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 107f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 106f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 109f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 108f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "111:" // Height 3: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 77b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 113f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 114f\n"
+ "113:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "114:" // Height 4: Column loop
+ "cbz x14, 115f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 126f\n"
+ "115:" // Height 4: no bias
+ "tbz %x[flags], #0, 125f\n"
+ "cmp x16, #0x10\n"
+ "bge 124f\n"
+ "tbz x16, #3, 119f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 117f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 116f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 123f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 123f\n"
+ "117:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 118f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 123f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 123f\n"
+ "119:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 121f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 120f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 123f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 123f\n"
+ "121:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 122f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 123f\n"
+ "122:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "123:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 126f\n"
+ "124:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 126f\n"
+ "125:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "126:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "127:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 128f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 129f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 129f\n"
+ "128:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "129:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "blt 132f\n"
+ "cmp x11, #0x10\n"
+ "blt 131f\n"
+ "130:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "bge 130b\n"
+ "131:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "132:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 137f\n"
+ "cmp x11, #0x2\n"
+ "blt 134f\n"
+ "133:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "bge 133b\n"
+ "cbz x11, 137f\n"
+ "134:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 135f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x11, #0, 136f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "b 136f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "137:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 127b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 138f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "138:" // Height 4: No activation
+ "cmp x16, #0x10\n"
+ "bge 147f\n"
+ "tbz x16, #3, 142f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 140f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 139f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 146f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 141f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 146f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 144f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 143f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 146f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 145f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "148:" // Height 4: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 114b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 150f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 151f\n"
+ "150:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "151:" // Height 5: Column loop
+ "cbz x14, 152f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 163f\n"
+ "152:" // Height 5: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "cmp x16, #0x10\n"
+ "bge 161f\n"
+ "tbz x16, #3, 156f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 154f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 153f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 160f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 160f\n"
+ "154:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 155f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 160f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 160f\n"
+ "156:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 158f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 157f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 160f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 160f\n"
+ "158:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 159f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 160f\n"
+ "159:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "160:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 163f\n"
+ "161:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 163f\n"
+ "162:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "163:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "164:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 166f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 166f\n"
+ "165:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "166:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "blt 169f\n"
+ "cmp x11, #0x10\n"
+ "blt 168f\n"
+ "167:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ "bge 167b\n"
+ "168:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ "169:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 174f\n"
+ "cmp x11, #0x2\n"
+ "blt 171f\n"
+ "170:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "bge 170b\n"
+ "cbz x11, 174f\n"
+ "171:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 172f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x11, #0, 173f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "b 173f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "174:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 164b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 175f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "175:" // Height 5: No activation
+ "cmp x16, #0x10\n"
+ "bge 184f\n"
+ "tbz x16, #3, 179f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 177f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 176f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 183f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 178f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 183f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 181f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 180f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 183f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 182f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "185:" // Height 5: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 151b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 187f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 188f\n"
+ "187:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "188:" // Height 6: Column loop
+ "cbz x14, 189f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 200f\n"
+ "189:" // Height 6: no bias
+ "tbz %x[flags], #0, 199f\n"
+ "cmp x16, #0x10\n"
+ "bge 198f\n"
+ "tbz x16, #3, 193f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 191f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 190f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 197f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 197f\n"
+ "191:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 192f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 197f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 197f\n"
+ "193:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 195f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 194f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 197f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 197f\n"
+ "195:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 196f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 197f\n"
+ "196:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "197:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 200f\n"
+ "198:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 200f\n"
+ "199:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "200:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "201:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 202f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 203f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "203:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "blt 206f\n"
+ "cmp x11, #0x10\n"
+ "blt 205f\n"
+ "204:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
+ "bge 204b\n"
+ "205:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
+ "206:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 211f\n"
+ "cmp x11, #0x2\n"
+ "blt 208f\n"
+ "207:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "bge 207b\n"
+ "cbz x11, 211f\n"
+ "208:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 209f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz x11, #0, 210f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 210f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "ldr h5, [x20, #0x0]\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "211:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 201b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 212f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "212:" // Height 6: No activation
+ "cmp x16, #0x10\n"
+ "bge 221f\n"
+ "tbz x16, #3, 216f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 214f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 213f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 220f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 215f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 220f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 218f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 217f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 220f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 219f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "222:" // Height 6: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 188b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
new file mode 100644
index 0000000000..46de98504e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<__fp16>, \
+ size_t, size_t, \
+ const __fp16 *, \
+ IndirectOutputArg<__fp16>, \
+ const __fp16 *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp16_mla_6x32( ARGLIST );
+
+class cls_a64_hybrid_fp16_mla_6x32
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp16_mla_6x32;
+
+ cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
new file mode 100644
index 0000000000..ff6cbec200
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -0,0 +1,5400 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <limits>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp16_mla_6x32 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
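+ // ka mirrors the argument block read by the inline assembly below; the asm operands
+ // address its fields as [%x[args_ptr], %[offsetof_*]].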
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
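+ // Flag bits consumed by the assembly: bit 0 = accumulate into existing output,
+ // bit 1 = apply the min/max activation clamp, bit 2 = indirect output pointers,
+ // bit 3 = indirect input pointers.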
+ __asm__ __volatile__(
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 251f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 201f\n"
+ "beq 151f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 101f\n"
+ "beq 51f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 23f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 22f\n"
+ "cmp x16, #0x20\n"
+ "bge 21f\n"
+ "tbz x16, #4, 12f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "b 20f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "b 20f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_24
+ "tbz x16, #1, 7f\n"
+ "ldr s11, [x13], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "b 20f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "b 20f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_16
+ "tbz x16, #2, 10f\n"
+ "ldr d10, [x13], #0x8\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "b 20f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "b 20f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x16, #1, 11f\n"
+ "ldr s10, [x13], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "b 20f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "b 20f\n"
+ "12:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x16, #3, 16f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 14f\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #1, 13f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "b 20f\n"
+ "13:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "b 20f\n"
+ "14:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 15f\n"
+ "ldr s9, [x13], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "b 20f\n"
+ "15:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "b 20f\n"
+ "16:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 18f\n"
+ "ldr d8, [x13], #0x8\n"
+ "tbz x16, #1, 17f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "b 20f\n"
+ "17:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "b 20f\n"
+ "18:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 19f\n"
+ "ldr s8, [x13], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "b 20f\n"
+ "19:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "20:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 23f\n"
+ "21:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 23f\n"
+ "22:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "23:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "24:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 25f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 26f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 26f\n"
+ "25:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "26:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "blt 29f\n"
+ "cmp x11, #0x10\n"
+ "blt 28f\n"
+ "27:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x10\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "bge 27b\n"
+ "28:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "29:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 31f\n"
+ "30:" // Height 1: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "add x15, x15, #0x40\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "cbnz x11, 30b\n"
+ "31:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 24b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 32f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "32:" // Height 1: No activation
+ "cmp x16, #0x20\n"
+ "bge 49f\n"
+ "tbz x16, #4, 40f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "tbz x16, #3, 36f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 34f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #1, 33f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "b 48f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "b 48f\n"
+ "34:" // Height 1: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 35f\n"
+ "str s11, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "b 48f\n"
+ "35:" // Height 1: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 48f\n"
+ "str h11, [x13, #0x0]\n"
+ "b 48f\n"
+ "36:" // Height 1: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 38f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #1, 37f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "b 48f\n"
+ "37:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "b 48f\n"
+ "38:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 39f\n"
+ "str s10, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "b 48f\n"
+ "39:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 48f\n"
+ "str h10, [x13, #0x0]\n"
+ "b 48f\n"
+ "40:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 44f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 42f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #1, 41f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "b 48f\n"
+ "41:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "b 48f\n"
+ "42:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 43f\n"
+ "str s9, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "b 48f\n"
+ "43:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 48f\n"
+ "str h9, [x13, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 46f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #1, 45f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "b 48f\n"
+ "45:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "b 48f\n"
+ "46:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 47f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "b 48f\n"
+ "47:" // Height 1: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "48:" // Height 1: Partial direct writeback: Done
+ "b 50f\n"
+ "49:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "50:" // Height 1: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 3b\n"
+ "b 302f\n"
+ "51:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 52f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "b 53f\n"
+ "52:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "53:" // Height 2: Column loop
+ "cbz x14, 54f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 73f\n"
+ "54:" // Height 2: no bias
+ "tbz %x[flags], #0, 72f\n"
+ "cmp x16, #0x20\n"
+ "bge 71f\n"
+ "tbz x16, #4, 62f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "tbz x16, #3, 58f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 56f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #1, 55f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "b 70f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "b 70f\n"
+ "56:" // Height 2: Partial accumulate: partial_2_24
+ "tbz x16, #1, 57f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "b 70f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "b 70f\n"
+ "58:" // Height 2: Partial accumulate: partial_4_16
+ "tbz x16, #2, 60f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "tbz x16, #1, 59f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "b 70f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "b 70f\n"
+ "60:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x16, #1, 61f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "b 70f\n"
+ "61:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "b 70f\n"
+ "62:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x16, #3, 66f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 64f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #1, 63f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "b 70f\n"
+ "63:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "b 70f\n"
+ "64:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 65f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "b 70f\n"
+ "65:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "b 70f\n"
+ "66:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 68f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "tbz x16, #1, 67f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "b 70f\n"
+ "67:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "b 70f\n"
+ "68:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 69f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "b 70f\n"
+ "69:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "70:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 73f\n"
+ "71:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 73f\n"
+ "72:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "73:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "74:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 75f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 76f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 76f\n"
+ "75:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "76:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "blt 79f\n"
+ "cmp x11, #0x10\n"
+ "blt 78f\n"
+ "77:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "bge 77b\n"
+ "78:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "79:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 81f\n"
+ "80:" // Height 2: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "cbnz x11, 80b\n"
+ "81:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 74b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 82f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "82:" // Height 2: No activation
+ "cmp x16, #0x20\n"
+ "bge 99f\n"
+ "tbz x16, #4, 90f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "tbz x16, #3, 86f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 84f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #1, 83f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "b 98f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "b 98f\n"
+ "84:" // Height 2: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 85f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "b 98f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 98f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "b 98f\n"
+ "86:" // Height 2: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 88f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #1, 87f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "b 98f\n"
+ "87:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "b 98f\n"
+ "88:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 89f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "b 98f\n"
+ "89:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 98f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "b 98f\n"
+ "90:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 94f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 92f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #1, 91f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "b 98f\n"
+ "91:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "b 98f\n"
+ "92:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 93f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "b 98f\n"
+ "93:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 98f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "b 98f\n"
+ "94:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 96f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #1, 95f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "b 98f\n"
+ "95:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "b 98f\n"
+ "96:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 97f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "b 98f\n"
+ "97:" // Height 2: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "98:" // Height 2: Partial direct writeback: Done
+ "b 100f\n"
+ "99:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "100:" // Height 2: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 53b\n"
+ "b 302f\n"
+ "101:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 102f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "add x27, x27, x19, LSL #1\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "103:" // Height 3: Column loop
+ "cbz x14, 104f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 123f\n"
+ "104:" // Height 3: no bias
+ "tbz %x[flags], #0, 122f\n"
+ "cmp x16, #0x20\n"
+ "bge 121f\n"
+ "tbz x16, #4, 112f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "tbz x16, #3, 108f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 106f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #1, 105f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "b 120f\n"
+ "105:" // Height 3: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "b 120f\n"
+ "106:" // Height 3: Partial accumulate: partial_2_24
+ "tbz x16, #1, 107f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "b 120f\n"
+ "107:" // Height 3: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "b 120f\n"
+ "108:" // Height 3: Partial accumulate: partial_4_16
+ "tbz x16, #2, 110f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "tbz x16, #1, 109f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "b 120f\n"
+ "109:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "b 120f\n"
+ "110:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x16, #1, 111f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "b 120f\n"
+ "111:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "b 120f\n"
+ "112:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x16, #3, 116f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 114f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #1, 113f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "b 120f\n"
+ "113:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "b 120f\n"
+ "114:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 115f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "b 120f\n"
+ "115:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "b 120f\n"
+ "116:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 118f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "tbz x16, #1, 117f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "b 120f\n"
+ "117:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "b 120f\n"
+ "118:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 119f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "b 120f\n"
+ "119:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "120:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 123f\n"
+ "121:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 123f\n"
+ "122:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "123:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "124:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 126f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 126f\n"
+ "125:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "126:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "blt 129f\n"
+ "cmp x11, #0x10\n"
+ "blt 128f\n"
+ "127:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "bge 127b\n"
+ "128:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "129:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 131f\n"
+ "130:" // Height 3: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "cbnz x11, 130b\n"
+ "131:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 124b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 132f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "132:" // Height 3: No activation
+ "cmp x16, #0x20\n"
+ "bge 149f\n"
+ "tbz x16, #4, 140f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "tbz x16, #3, 136f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 134f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #1, 133f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "b 148f\n"
+ "133:" // Height 3: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "b 148f\n"
+ "134:" // Height 3: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 135f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "b 148f\n"
+ "135:" // Height 3: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 148f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "b 148f\n"
+ "136:" // Height 3: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 138f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #1, 137f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "b 148f\n"
+ "137:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "b 148f\n"
+ "138:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 139f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "b 148f\n"
+ "139:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 148f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "b 148f\n"
+ "140:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 144f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 142f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #1, 141f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "b 148f\n"
+ "141:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "b 148f\n"
+ "142:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 143f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "b 148f\n"
+ "143:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 148f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 146f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #1, 145f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "b 148f\n"
+ "145:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "b 148f\n"
+ "146:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 147f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "b 148f\n"
+ "147:" // Height 3: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "148:" // Height 3: Partial direct writeback: Done
+ "b 150f\n"
+ "149:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "150:" // Height 3: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 103b\n"
+ "b 302f\n"
+ "151:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 152f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 153f\n"
+ "152:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "153:" // Height 4: Column loop
+ "cbz x14, 154f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 173f\n"
+ "154:" // Height 4: no bias
+ "tbz %x[flags], #0, 172f\n"
+ "cmp x16, #0x20\n"
+ "bge 171f\n"
+ "tbz x16, #4, 162f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "tbz x16, #3, 158f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 156f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #1, 155f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "b 170f\n"
+ "155:" // Height 4: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "b 170f\n"
+ "156:" // Height 4: Partial accumulate: partial_2_24
+ "tbz x16, #1, 157f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "b 170f\n"
+ "157:" // Height 4: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "b 170f\n"
+ "158:" // Height 4: Partial accumulate: partial_4_16
+ "tbz x16, #2, 160f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "tbz x16, #1, 159f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "b 170f\n"
+ "159:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "b 170f\n"
+ "160:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x16, #1, 161f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "b 170f\n"
+ "161:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "b 170f\n"
+ "162:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x16, #3, 166f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 164f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #1, 163f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "b 170f\n"
+ "163:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "b 170f\n"
+ "164:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 165f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "b 170f\n"
+ "165:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "b 170f\n"
+ "166:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 168f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "tbz x16, #1, 167f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "b 170f\n"
+ "167:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "b 170f\n"
+ "168:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 169f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "b 170f\n"
+ "169:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "170:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 173f\n"
+ "171:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 173f\n"
+ "172:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "173:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "174:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 175f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 176f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 176f\n"
+ "175:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "176:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "blt 179f\n"
+ "cmp x11, #0x10\n"
+ "blt 178f\n"
+ "177:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "bge 177b\n"
+ "178:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "179:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 181f\n"
+ "180:" // Height 4: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "cbnz x11, 180b\n"
+ "181:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 174b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 182f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "182:" // Height 4: No activation
+ "cmp x16, #0x20\n"
+ "bge 199f\n"
+ "tbz x16, #4, 190f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "tbz x16, #3, 186f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 184f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #1, 183f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "b 198f\n"
+ "183:" // Height 4: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "b 198f\n"
+ "184:" // Height 4: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 185f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "b 198f\n"
+ "185:" // Height 4: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 198f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "b 198f\n"
+ "186:" // Height 4: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 188f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #1, 187f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "b 198f\n"
+ "187:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "b 198f\n"
+ "188:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 189f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "b 198f\n"
+ "189:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 198f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "b 198f\n"
+ "190:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 194f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 192f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #1, 191f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "b 198f\n"
+ "191:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "b 198f\n"
+ "192:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 193f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "b 198f\n"
+ "193:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 198f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "b 198f\n"
+ "194:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 196f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #1, 195f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "b 198f\n"
+ "195:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "b 198f\n"
+ "196:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 197f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "b 198f\n"
+ "197:" // Height 4: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "198:" // Height 4: Partial direct writeback: Done
+ "b 200f\n"
+ "199:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "200:" // Height 4: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 153b\n"
+ "b 302f\n"
+ "201:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 202f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "203:" // Height 5: Column loop
+ "cbz x14, 204f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 223f\n"
+ "204:" // Height 5: no bias
+ "tbz %x[flags], #0, 222f\n"
+ "cmp x16, #0x20\n"
+ "bge 221f\n"
+ "tbz x16, #4, 212f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "ld1 { v25.8h }, [x23], #0x10\n"
+ "tbz x16, #3, 208f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "ld1 { v26.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 206f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #1, 205f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "mov x19, #0x3c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x23]\n"
+ "b 220f\n"
+ "205:" // Height 5: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x23]\n"
+ "b 220f\n"
+ "206:" // Height 5: Partial accumulate: partial_2_24
+ "tbz x16, #1, 207f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "b 220f\n"
+ "207:" // Height 5: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "b 220f\n"
+ "208:" // Height 5: Partial accumulate: partial_4_16
+ "tbz x16, #2, 210f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "tbz x16, #1, 209f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x23]\n"
+ "b 220f\n"
+ "209:" // Height 5: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x23]\n"
+ "b 220f\n"
+ "210:" // Height 5: Partial accumulate: partial_2_16
+ "tbz x16, #1, 211f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x23]\n"
+ "b 220f\n"
+ "211:" // Height 5: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h26, [x23, #0x0]\n"
+ "b 220f\n"
+ "212:" // Height 5: Partial accumulate: partial_8_0
+ "tbz x16, #3, 216f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 214f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #1, 213f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "mov x19, #0x1c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "b 220f\n"
+ "213:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "b 220f\n"
+ "214:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 215f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "b 220f\n"
+ "215:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 218f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x16, #1, 217f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "b 220f\n"
+ "217:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "b 220f\n"
+ "218:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 219f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "b 220f\n"
+ "219:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "220:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 223f\n"
+ "221:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 223f\n"
+ "222:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "223:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "224:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 225f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 226f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 226f\n"
+ "225:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "226:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "blt 229f\n"
+ "cmp x11, #0x10\n"
+ "blt 228f\n"
+ "227:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "bge 227b\n"
+ "228:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "229:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 231f\n"
+ "230:" // Height 5: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "cbnz x11, 230b\n"
+ "231:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 224b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 232f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "232:" // Height 5: No activation
+ "cmp x16, #0x20\n"
+ "bge 249f\n"
+ "tbz x16, #4, 240f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v25.8h }, [x23], #0x10\n"
+ "tbz x16, #3, 236f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "st1 { v26.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 234f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #1, 233f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "b 248f\n"
+ "233:" // Height 5: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "b 248f\n"
+ "234:" // Height 5: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 235f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "str s27, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "b 248f\n"
+ "235:" // Height 5: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 248f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "str h27, [x23, #0x0]\n"
+ "b 248f\n"
+ "236:" // Height 5: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 238f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #1, 237f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v26.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "st1 { v26.h }[6], [x23]\n"
+ "b 248f\n"
+ "237:" // Height 5: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "st1 { v26.h }[4], [x23]\n"
+ "b 248f\n"
+ "238:" // Height 5: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 239f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "str s26, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "st1 { v26.h }[2], [x23]\n"
+ "b 248f\n"
+ "239:" // Height 5: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 248f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "str h26, [x23, #0x0]\n"
+ "b 248f\n"
+ "240:" // Height 5: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 244f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 242f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #1, 241f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "st1 { v25.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "st1 { v25.h }[6], [x23]\n"
+ "b 248f\n"
+ "241:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "st1 { v25.h }[4], [x23]\n"
+ "b 248f\n"
+ "242:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 243f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "str s25, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "st1 { v25.h }[2], [x23]\n"
+ "b 248f\n"
+ "243:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 248f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "str h25, [x23, #0x0]\n"
+ "b 248f\n"
+ "244:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 246f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #1, 245f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "b 248f\n"
+ "245:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "b 248f\n"
+ "246:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 247f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "b 248f\n"
+ "247:" // Height 5: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "str h24, [x23, #0x0]\n"
+ "248:" // Height 5: Partial direct writeback: Done
+ "b 250f\n"
+ "249:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "250:" // Height 5: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 203b\n"
+ "b 302f\n"
+ "251:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 252f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 253f\n"
+ "252:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "add x21, x23, x19, LSL #1\n"
+ "add %x[output_ptr], x21, x19, LSL #1\n"
+ "253:" // Height 6: Column loop
+ "cbz x14, 254f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 273f\n"
+ "254:" // Height 6: no bias
+ "tbz %x[flags], #0, 272f\n"
+ "cmp x16, #0x20\n"
+ "bge 271f\n"
+ "tbz x16, #4, 262f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "ld1 { v25.8h }, [x23], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
+ "tbz x16, #3, 258f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "ld1 { v26.8h }, [x23], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 256f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #1, 255f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "mov x19, #0x3c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v31.h }[6], [x21]\n"
+ "b 270f\n"
+ "255:" // Height 6: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v31.h }[4], [x21]\n"
+ "b 270f\n"
+ "256:" // Height 6: Partial accumulate: partial_2_24
+ "tbz x16, #1, 257f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v31.h }[2], [x21]\n"
+ "b 270f\n"
+ "257:" // Height 6: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
+ "b 270f\n"
+ "258:" // Height 6: Partial accumulate: partial_4_16
+ "tbz x16, #2, 260f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x16, #1, 259f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v30.h }[6], [x21]\n"
+ "b 270f\n"
+ "259:" // Height 6: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v30.h }[4], [x21]\n"
+ "b 270f\n"
+ "260:" // Height 6: Partial accumulate: partial_2_16
+ "tbz x16, #1, 261f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v30.h }[2], [x21]\n"
+ "b 270f\n"
+ "261:" // Height 6: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h26, [x23, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
+ "b 270f\n"
+ "262:" // Height 6: Partial accumulate: partial_8_0
+ "tbz x16, #3, 266f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 264f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #1, 263f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
+ "mov x19, #0x1c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v29.h }[6], [x21]\n"
+ "b 270f\n"
+ "263:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v29.h }[4], [x21]\n"
+ "b 270f\n"
+ "264:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 265f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v29.h }[2], [x21]\n"
+ "b 270f\n"
+ "265:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
+ "b 270f\n"
+ "266:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 268f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x16, #1, 267f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v28.h }[6], [x21]\n"
+ "b 270f\n"
+ "267:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v28.h }[4], [x21]\n"
+ "b 270f\n"
+ "268:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 269f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v28.h }[2], [x21]\n"
+ "b 270f\n"
+ "269:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
+ "270:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 273f\n"
+ "271:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 273f\n"
+ "272:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "273:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "274:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 275f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 276f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 276f\n"
+ "275:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "276:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "blt 279f\n"
+ "cmp x11, #0x10\n"
+ "blt 278f\n"
+ "277:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "bge 277b\n"
+ "278:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "279:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 281f\n"
+ "280:" // Height 6: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "cbnz x11, 280b\n"
+ "281:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 274b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 282f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmin v28.8h, v28.8h, v0.8h\n"
+ "fmin v29.8h, v29.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "fmax v28.8h, v28.8h, v1.8h\n"
+ "fmax v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v0.8h\n"
+ "fmin v31.8h, v31.8h, v0.8h\n"
+ "fmax v30.8h, v30.8h, v1.8h\n"
+ "fmax v31.8h, v31.8h, v1.8h\n"
+ "282:" // Height 6: No activation
+ "cmp x16, #0x20\n"
+ "bge 299f\n"
+ "tbz x16, #4, 290f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v25.8h }, [x23], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
+ "tbz x16, #3, 286f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "st1 { v26.8h }, [x23], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 284f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #1, 283f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x23], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "st1 { v31.h }[6], [x21]\n"
+ "b 298f\n"
+ "283:" // Height 6: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "st1 { v31.h }[4], [x21]\n"
+ "b 298f\n"
+ "284:" // Height 6: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 285f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "str s27, [x23], #0x4\n"
+ "str s31, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "st1 { v31.h }[2], [x21]\n"
+ "b 298f\n"
+ "285:" // Height 6: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 298f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "str h27, [x23, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
+ "b 298f\n"
+ "286:" // Height 6: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 288f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #1, 287f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v26.s }[2], [x23], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "st1 { v26.h }[6], [x23]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "b 298f\n"
+ "287:" // Height 6: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "st1 { v26.h }[4], [x23]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "b 298f\n"
+ "288:" // Height 6: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 289f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "str s26, [x23], #0x4\n"
+ "str s30, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "st1 { v26.h }[2], [x23]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "b 298f\n"
+ "289:" // Height 6: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 298f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "str h26, [x23, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
+ "b 298f\n"
+ "290:" // Height 6: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 294f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 292f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #1, 291f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "st1 { v25.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "st1 { v25.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "b 298f\n"
+ "291:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "st1 { v25.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "b 298f\n"
+ "292:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 293f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "str s25, [x23], #0x4\n"
+ "str s29, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "st1 { v25.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "b 298f\n"
+ "293:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 298f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "str h25, [x23, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
+ "b 298f\n"
+ "294:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 296f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #1, 295f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "b 298f\n"
+ "295:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "b 298f\n"
+ "296:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 297f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "b 298f\n"
+ "297:" // Height 6: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "str h24, [x23, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
+ "298:" // Height 6: Partial direct writeback: Done
+ "b 300f\n"
+ "299:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "300:" // Height 6: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 253b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 302f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 301f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "301:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "302:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
deleted file mode 100644
index 94fcd1064e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
+++ /dev/null
@@ -1,2427 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v1.d[1], temploadreg1\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v2.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v28.16b, v16.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "mov v29.16b, v17.16b\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "mov v30.16b, v18.16b\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "mov v31.16b, v19.16b\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d7, [a_ptr3]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v7.d[1], temploadreg3\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v1.d[1], temploadreg1\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ins v2.d[1], temploadreg2\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ins v3.d[1], temploadreg3\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d7, [a_ptr3]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v7.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
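For reference, a minimal standalone C++ sketch (not part of the library; the identifiers and the main() wrapper below are illustrative only) of how these deleted a64_hybrid_fp32_mla_16x4 kernels split the K dimension before entering the inline assembly, following the prologue visible in generic.cpp below: the main loop consumes 8 columns of A per iteration, a "regs" tail consumes one or two 4-wide blocks from registers that are already loaded, and "blocks" counts the remaining single columns. When the output tile is narrower than 16 floats, results are additionally staged through a 16-floats-per-row result buffer and copied back to C afterwards, as seen at the end of the kernel above.

#include <cstdio>

// Sketch of the K-dimension decomposition used by the hybrid kernels
// (mirrors the arithmetic in the generic.cpp prologue below).
int main() {
    int K = 23;                              // example GEMM depth
    const long loops  = ((K + 4) / 8) - 1;   // main-loop iterations, 8 K-values each
    K -= loops * 8;                          // remaining K is now in [4, 11]
    const long regs   = (K / 4) - 1;         // 1 if the tail holds a second 4-wide block, else 0
    K -= (regs + 1) * 4;                     // the tail always consumes at least one 4-wide block
    const long blocks = K;                   // leftover 0..3 single K-values
    // For K = 23: loops = 2 (16 values), regs = 0 (one 4-wide tail block), blocks = 3.
    std::printf("loops=%ld regs=%ld blocks=%ld\n", loops, regs, blocks);
    return 0;
}

In the assembly itself these three counts drive the "cbz %[loops]", "cbz %[regs]" and "cbz %[blocks]" branches, so every K-value is multiplied exactly once regardless of how K aligns to the 8- and 4-wide unrolls.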
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
deleted file mode 100644
index 016bef4b9d..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
+++ /dev/null
@@ -1,1802 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v28.16b, v16.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "mov v29.16b, v17.16b\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "mov v30.16b, v18.16b\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "mov v31.16b, v19.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
deleted file mode 100644
index 3f1df76a6a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
+++ /dev/null
@@ -1,1810 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov v27.16b, v19.16b\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov v28.16b, v16.16b\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov v29.16b, v17.16b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov v30.16b, v18.16b\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov v31.16b, v19.16b\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "fmla v28.4s, v8.4s, v7.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v29.4s, v9.4s, v7.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "fmla v30.4s, v10.4s, v7.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "fmla v31.4s, v11.4s, v7.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v28.4s, v8.4s, v7.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v29.4s, v9.4s, v7.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v30.4s, v10.4s, v7.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "fmla v31.4s, v11.4s, v7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "fmla v28.4s, v8.4s, v7.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v29.4s, v9.4s, v7.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "fmla v30.4s, v10.4s, v7.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "fmla v31.4s, v11.4s, v7.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v28.4s, v8.4s, v7.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v29.4s, v9.4s, v7.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v30.4s, v10.4s, v7.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "fmla v31.4s, v11.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
deleted file mode 100644
index 7442d258ec..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
+++ /dev/null
@@ -1,1934 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[4];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=4ul) {
- const long width = std::min((unsigned long)N-x0, 4ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 4);
- float result_buffer[32];
- const unsigned long ldcb = (use_result_buffer ? 4 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 8); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 4 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "ldr q24, [%[biasptr]]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q16, [%[b_ptr0]]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q16, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "ldr q9, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "ldr q10, [a_ptr2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "str q26, [c_ptr2]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "mov v30.16b, v24.16b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "ldr q6, [a_ptr6]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q6, [a_ptr6, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "prfm PSTL1KEEP, [c_ptr6]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "ldr s6, [a_ptr6]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "add a_ptr6, a_ptr6, #0x4\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmax v30.4s, v30.4s, v22.4s\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "fmin v30.4s, v30.4s, v23.4s\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- "str q30, [c_ptr6]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "mov v30.16b, v24.16b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "mov v31.16b, v24.16b\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q6, [a_ptr6]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "ldr q7, [a_ptr7]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q15, [a_ptr7]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q6, [a_ptr6, #-0x10]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q7, [a_ptr7, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v31.4s, v16.4s, v15.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v31.4s, v17.4s, v15.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v31.4s, v18.4s, v15.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "fmla v31.4s, v19.4s, v15.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "prfm PSTL1KEEP, [c_ptr6]\n"
- "prfm PSTL1KEEP, [c_ptr7]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "ldr q15, [a_ptr7]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v31.4s, v16.4s, v15.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v31.4s, v17.4s, v15.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v31.4s, v18.4s, v15.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "fmla v31.4s, v19.4s, v15.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "ldr s6, [a_ptr6]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "add a_ptr6, a_ptr6, #0x4\n"
- "ldr s7, [a_ptr7]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "add a_ptr7, a_ptr7, #0x4\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmax v30.4s, v30.4s, v22.4s\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmax v31.4s, v31.4s, v22.4s\n"
- "str q26, [c_ptr2]\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "fmin v30.4s, v30.4s, v23.4s\n"
- "fmin v31.4s, v31.4s, v23.4s\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- "str q30, [c_ptr6]\n"
- "str q31, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 8); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 4 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
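
The removed block above is the tail of the old fixed-height hybrid FP32 kernel: it picked one of several hand-written inline-assembly bodies by the number of rows left to process (the case 5 / case 6 / case 7 / case 8 blocks visible here), and when the output tile was narrower than a full vector it accumulated into a small scratch buffer and copied only the valid "width" columns back afterwards. A minimal C++ sketch of that copy-back step, paraphrasing the loop at the end of the deleted code (the function name copy_back_partial_tile and the kernel_width parameter are illustrative, not part of the library):

    #include <algorithm>

    // Sketch of the result-buffer writeback seen above: each of up to 8 rows holds
    // 'kernel_width' floats in the scratch buffer, but only 'width' of them are valid
    // and get copied into the real output array with leading dimension 'ldc'.
    static void copy_back_partial_tile(float *c_ptr_real, const float *result_buffer,
                                       int rows_left, unsigned int width,
                                       int ldc, int kernel_width = 4)
    {
        for (int cy = 0; cy < std::min(rows_left, 8); cy++) {
            for (unsigned int cx = 0; cx < width; cx++) {
                c_ptr_real[cy * ldc + cx] = result_buffer[cy * kernel_width + cx];
            }
        }
    }
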
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 4147ab60dc..e0c61e4113 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,44 +10,49 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_hybrid_fp32_mla_6x16( ARGLIST );
-class hybrid_fp32_mla_16x4
+class cls_a64_hybrid_fp32_mla_6x16
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6;
}
static unsigned int out_width()
@@ -65,47 +70,33 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 2.866 };
+ return { 2.00 };
case CPUModel::A53:
- return { 1.419 };
+ return { 1.43 };
case CPUModel::A73:
- return { 2.551 };
+ return { 2.56 };
default:
- return { 6.25 };
+ return { 6.26 };
}
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_fp32_mla_16x4;
+ kern_type kernel=a64_hybrid_fp32_mla_6x16;
- hybrid_fp32_mla_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_fp32_mla_16x4_a55;
- } else if (ci->get_cpu_model() == CPUModel::X1) {
- kernel = a64_hybrid_fp32_mla_16x4_x1;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
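
In the reworked header above, the long flat parameter list is folded into the ARGLIST macro so the free-function declaration and the class's kern_type typedef stay in sync. For reference, a sketch of what that declaration looks like once expanded, using the parameter names the new generic.cpp below gives them (the include is an assumption here, mirroring the new source file, and is taken to provide IndirectInputArg, IndirectOutputArg and Activation):

    // Illustrative expansion only -- the real header keeps this behind the ARGLIST macro.
    #include "arm_gemm.hpp"   // assumed (as in generic.cpp below) to provide the argument types

    namespace arm_gemm {

    void a64_hybrid_fp32_mla_6x16(
        unsigned int num_strings, const unsigned int *string_lengths,
        IndirectInputArg<float> A_arg,
        size_t M, size_t N,
        const float *B_ptr,
        IndirectOutputArg<float> output_arg,
        const float *bias, Activation act, bool accumulate);

    } // namespace arm_gemm
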
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
new file mode 100644
index 0000000000..884e8986c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -0,0 +1,3430 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 171f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 137f\n"
+ "beq 103f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 15f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x16, #0x10\n"
+ "bge 13f\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 7f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 10f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 11f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 15f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x11, #0x4\n"
+ "blt 21f\n"
+ "cmp x11, #0x8\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x8\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "add x15, x15, #0x40\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "cbnz x11, 22b\n"
+ "23:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 16b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 24f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "24:" // Height 1: No activation
+ "cmp x16, #0x10\n"
+ "bge 33f\n"
+ "tbz x16, #3, 28f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 26f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 25f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 32f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 27f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 32f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 30f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 29f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 32f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 31f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 3b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 36f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "37:" // Height 2: Column loop
+ "cbz x14, 38f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 49f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x16, #0x10\n"
+ "bge 47f\n"
+ "tbz x16, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x4\n"
+ "blt 55f\n"
+ "cmp x11, #0x8\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x8\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "cbnz x11, 56b\n"
+ "57:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 58f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "58:" // Height 2: No activation
+ "cmp x16, #0x10\n"
+ "bge 67f\n"
+ "tbz x16, #3, 62f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 60f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 59f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 66f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 61f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 66f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 63f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 66f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 65f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
+ "b 68f\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "68:" // Height 2: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 37b\n"
+ "b 206f\n"
+ "69:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 70f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 71f\n"
+ "70:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "71:" // Height 3: Column loop
+ "cbz x14, 72f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 83f\n"
+ "72:" // Height 3: no bias
+ "tbz %x[flags], #0, 82f\n"
+ "cmp x16, #0x10\n"
+ "bge 81f\n"
+ "tbz x16, #3, 76f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 74f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 73f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 80f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 75f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 80f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 78f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 77f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 80f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 79f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 80f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "80:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 83f\n"
+ "81:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 83f\n"
+ "82:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "83:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "84:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 86f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 86f\n"
+ "85:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "86:" // Height 3: input setup done
+ "cmp x11, #0x4\n"
+ "blt 89f\n"
+ "cmp x11, #0x8\n"
+ "blt 88f\n"
+ "87:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "bge 87b\n"
+ "88:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "89:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 91f\n"
+ "90:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "cbnz x11, 90b\n"
+ "91:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 84b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 92f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "92:" // Height 3: No activation
+ "cmp x16, #0x10\n"
+ "bge 101f\n"
+ "tbz x16, #3, 96f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 94f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 93f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 100f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 95f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 100f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 98f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 97f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 100f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 99f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n"
+ "101:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "102:" // Height 3: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 71b\n"
+ "b 206f\n"
+ "103:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 104f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 105f\n"
+ "104:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "105:" // Height 4: Column loop
+ "cbz x14, 106f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 117f\n"
+ "106:" // Height 4: no bias
+ "tbz %x[flags], #0, 116f\n"
+ "cmp x16, #0x10\n"
+ "bge 115f\n"
+ "tbz x16, #3, 110f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 108f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 107f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 114f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 114f\n"
+ "108:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 109f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 114f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 114f\n"
+ "110:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 111f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 114f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 114f\n"
+ "112:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 113f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 114f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "114:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 117f\n"
+ "115:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 117f\n"
+ "116:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "117:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "118:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 119f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 120f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 120f\n"
+ "119:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "120:" // Height 4: input setup done
+ "cmp x11, #0x4\n"
+ "blt 123f\n"
+ "cmp x11, #0x8\n"
+ "blt 122f\n"
+ "121:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "bge 121b\n"
+ "122:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "123:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 125f\n"
+ "124:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "cbnz x11, 124b\n"
+ "125:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 118b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 126f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "126:" // Height 4: No activation
+ "cmp x16, #0x10\n"
+ "bge 135f\n"
+ "tbz x16, #3, 130f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 128f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 127f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 134f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 129f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 134f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 132f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 131f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 134f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 133f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "136:" // Height 4: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 105b\n"
+ "b 206f\n"
+ "137:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 138f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 139f\n"
+ "138:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "139:" // Height 5: Column loop
+ "cbz x14, 140f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 151f\n"
+ "140:" // Height 5: no bias
+ "tbz %x[flags], #0, 150f\n"
+ "cmp x16, #0x10\n"
+ "bge 149f\n"
+ "tbz x16, #3, 144f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 142f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 141f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 148f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 148f\n"
+ "142:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 143f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 148f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 146f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 145f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 148f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 148f\n"
+ "146:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 147f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 148f\n"
+ "147:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "148:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 151f\n"
+ "149:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 151f\n"
+ "150:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "151:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "152:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 153f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 154f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 154f\n"
+ "153:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "154:" // Height 5: input setup done
+ "cmp x11, #0x4\n"
+ "blt 157f\n"
+ "cmp x11, #0x8\n"
+ "blt 156f\n"
+ "155:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "bge 155b\n"
+ "156:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "157:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 159f\n"
+ "158:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "cbnz x11, 158b\n"
+ "159:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 152b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 160f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "160:" // Height 5: No activation
+ "cmp x16, #0x10\n"
+ "bge 169f\n"
+ "tbz x16, #3, 164f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 162f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 161f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 168f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 163f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 168f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 166f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 165f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 168f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 167f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "170:" // Height 5: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 139b\n"
+ "b 206f\n"
+ "171:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 172f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 173f\n"
+ "172:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "173:" // Height 6: Column loop
+ "cbz x14, 174f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 185f\n"
+ "174:" // Height 6: no bias
+ "tbz %x[flags], #0, 184f\n"
+ "cmp x16, #0x10\n"
+ "bge 183f\n"
+ "tbz x16, #3, 178f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 176f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 175f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 182f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 182f\n"
+ "176:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 177f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 182f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 182f\n"
+ "178:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 180f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 179f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 182f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 182f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 181f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 182f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "182:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 185f\n"
+ "183:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 185f\n"
+ "184:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "185:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "186:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 187f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 188f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 188f\n"
+ "187:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "188:" // Height 6: input setup done
+ "cmp x11, #0x4\n"
+ "blt 191f\n"
+ "cmp x11, #0x8\n"
+ "blt 190f\n"
+ "189:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "bge 189b\n"
+ "190:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "191:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 193f\n"
+ "192:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "cbnz x11, 192b\n"
+ "193:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 186b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 194f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "194:" // Height 6: No activation
+ "cmp x16, #0x10\n"
+ "bge 203f\n"
+ "tbz x16, #3, 198f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 196f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 195f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 202f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 197f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 202f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 200f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 199f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 202f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 201f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "204:" // Height 6: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 173b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 206f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 205f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "205:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "206:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
new file mode 100644
index 0000000000..043d0643f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_8x4( ARGLIST );
+
+class cls_a64_hybrid_fp32_mla_8x4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
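+ // Register blocking is 8 output rows by 4 output columns (one 128-bit
+ // accumulator per row); k_unroll() == 1, so no rounding of K is required.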
+ static constexpr unsigned int out_height()
+ {
+ return 8;
+ }
+
+ static unsigned int out_width()
+ {
+ return 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp32_mla_8x4;
+
+ cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
new file mode 100644
index 0000000000..3ab6cad368
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -0,0 +1,2195 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_8x4 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
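+ // KernelArgs gathers the parameters the inline assembly reads through the
+ // %[offsetof_...] operands; minval/maxval default to +/- infinity so a
+ // plain ReLU only clamps the lower bound.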
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
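+ // flags bit layout as tested by the assembly below:
+ //   bit 0 - accumulate into existing output, bit 1 - apply min/max clamp,
+ //   bit 2 - indirect (per-row pointer) output, bit 3 - indirect input.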
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 155f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 133f\n"
+ "beq 111f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 89f\n"
+ "beq 67f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 45f\n"
+ "beq 23f\n"
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x8, 4f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "add x8, x8, #0x10\n"
+ "b 9f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 8f\n"
+ "cmp x6, #0x4\n"
+ "bge 7f\n"
+ "tbz x6, #1, 5f\n"
+ "ldr d24, [x17], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 6f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "b 6f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "6:" // Height 1: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "b 9f\n"
+ "7:" // Height 1: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "b 9f\n"
+ "8:" // Height 1: no accumulate
+ "movi v24.16b, #0x0\n"
+ "9:" // Height 1: setup done
+ "mov x16, #0x0\n"
+ "10:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 11f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "cbnz x16, 12f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "b 12f\n"
+ "11:" // Height 1: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "12:" // Height 1: input setup done
+ "cmp x15, #0x4\n"
+ "blt 15f\n"
+ "cmp x15, #0x8\n"
+ "blt 14f\n"
+ "13:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "bge 13b\n"
+ "14:" // Height 1: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "15:" // Height 1: Multiply loop: Main loop skip
+ "cbz x15, 17f\n"
+ "16:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "add x7, x7, #0x10\n"
+ "cbnz x15, 16b\n"
+ "17:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 10b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "tbz %x[flags], #1, 18f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "18:" // Height 1: No activation
+ "cmp x6, #0x4\n"
+ "bge 21f\n"
+ "tbz x6, #1, 19f\n"
+ "str d24, [x17], #0x8\n"
+ "tbz x6, #0, 20f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "b 20f\n"
+ "19:" // Height 1: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "20:" // Height 1: Partial direct writeback: Done
+ "b 22f\n"
+ "21:" // Height 1: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "22:" // Height 1: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 3b\n"
+ "b 178f\n"
+ "23:" // Height 2
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 24f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 25f\n"
+ "24:" // Height 2: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "25:" // Height 2: Column loop
+ "cbz x8, 26f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "b 31f\n"
+ "26:" // Height 2: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "cmp x6, #0x4\n"
+ "bge 29f\n"
+ "tbz x6, #1, 27f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 28f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "b 28f\n"
+ "27:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "28:" // Height 2: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "b 31f\n"
+ "29:" // Height 2: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "b 31f\n"
+ "30:" // Height 2: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "31:" // Height 2: setup done
+ "mov x16, #0x0\n"
+ "32:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x16, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "34:" // Height 2: input setup done
+ "cmp x15, #0x4\n"
+ "blt 37f\n"
+ "cmp x15, #0x8\n"
+ "blt 36f\n"
+ "35:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "bge 35b\n"
+ "36:" // Height 2: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "37:" // Height 2: Multiply loop: Main loop skip
+ "cbz x15, 39f\n"
+ "38:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "cbnz x15, 38b\n"
+ "39:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 32b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 40f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "40:" // Height 2: No activation
+ "cmp x6, #0x4\n"
+ "bge 43f\n"
+ "tbz x6, #1, 41f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "tbz x6, #0, 42f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "b 42f\n"
+ "41:" // Height 2: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "42:" // Height 2: Partial direct writeback: Done
+ "b 44f\n"
+ "43:" // Height 2: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "44:" // Height 2: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 25b\n"
+ "b 178f\n"
+ "45:" // Height 3
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 46f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "add x11, x11, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 3: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "47:" // Height 3: Column loop
+ "cbz x8, 48f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "b 53f\n"
+ "48:" // Height 3: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "cmp x6, #0x4\n"
+ "bge 51f\n"
+ "tbz x6, #1, 49f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 50f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "b 50f\n"
+ "49:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "50:" // Height 3: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "b 53f\n"
+ "51:" // Height 3: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "b 53f\n"
+ "52:" // Height 3: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "53:" // Height 3: setup done
+ "mov x16, #0x0\n"
+ "54:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "cbnz x16, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 3: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "56:" // Height 3: input setup done
+ "cmp x15, #0x4\n"
+ "blt 59f\n"
+ "cmp x15, #0x8\n"
+ "blt 58f\n"
+ "57:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "bge 57b\n"
+ "58:" // Height 3: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "59:" // Height 3: Multiply loop: Main loop skip
+ "cbz x15, 61f\n"
+ "60:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "cbnz x15, 60b\n"
+ "61:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "tbz %x[flags], #1, 62f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "62:" // Height 3: No activation
+ "cmp x6, #0x4\n"
+ "bge 65f\n"
+ "tbz x6, #1, 63f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "tbz x6, #0, 64f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "b 64f\n"
+ "63:" // Height 3: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "64:" // Height 3: Partial direct writeback: Done
+ "b 66f\n"
+ "65:" // Height 3: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "66:" // Height 3: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 47b\n"
+ "b 178f\n"
+ "67:" // Height 4
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 68f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 69f\n"
+ "68:" // Height 4: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "69:" // Height 4: Column loop
+ "cbz x8, 70f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "b 75f\n"
+ "70:" // Height 4: no bias
+ "tbz %x[flags], #0, 74f\n"
+ "cmp x6, #0x4\n"
+ "bge 73f\n"
+ "tbz x6, #1, 71f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 72f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "b 72f\n"
+ "71:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "72:" // Height 4: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "b 75f\n"
+ "73:" // Height 4: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "b 75f\n"
+ "74:" // Height 4: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "75:" // Height 4: setup done
+ "mov x16, #0x0\n"
+ "76:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 77f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "cbnz x16, 78f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 78f\n"
+ "77:" // Height 4: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "78:" // Height 4: input setup done
+ "cmp x15, #0x4\n"
+ "blt 81f\n"
+ "cmp x15, #0x8\n"
+ "blt 80f\n"
+ "79:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "bge 79b\n"
+ "80:" // Height 4: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "81:" // Height 4: Multiply loop: Main loop skip
+ "cbz x15, 83f\n"
+ "82:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "cbnz x15, 82b\n"
+ "83:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 76b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 84f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "84:" // Height 4: No activation
+ "cmp x6, #0x4\n"
+ "bge 87f\n"
+ "tbz x6, #1, 85f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "tbz x6, #0, 86f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "b 86f\n"
+ "85:" // Height 4: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "86:" // Height 4: Partial direct writeback: Done
+ "b 88f\n"
+ "87:" // Height 4: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "88:" // Height 4: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 69b\n"
+ "b 178f\n"
+ "89:" // Height 5
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 90f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 91f\n"
+ "90:" // Height 5: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "91:" // Height 5: Column loop
+ "cbz x8, 92f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "b 97f\n"
+ "92:" // Height 5: no bias
+ "tbz %x[flags], #0, 96f\n"
+ "cmp x6, #0x4\n"
+ "bge 95f\n"
+ "tbz x6, #1, 93f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 94f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "b 94f\n"
+ "93:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "94:" // Height 5: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 97f\n"
+ "95:" // Height 5: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "b 97f\n"
+ "96:" // Height 5: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "97:" // Height 5: setup done
+ "mov x16, #0x0\n"
+ "98:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "cbnz x16, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 100f\n"
+ "99:" // Height 5: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "100:" // Height 5: input setup done
+ "cmp x15, #0x4\n"
+ "blt 103f\n"
+ "cmp x15, #0x8\n"
+ "blt 102f\n"
+ "101:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "bge 101b\n"
+ "102:" // Height 5: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "103:" // Height 5: Multiply loop: Main loop skip
+ "cbz x15, 105f\n"
+ "104:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "cbnz x15, 104b\n"
+ "105:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 106f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "106:" // Height 5: No activation
+ "cmp x6, #0x4\n"
+ "bge 109f\n"
+ "tbz x6, #1, 107f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "tbz x6, #0, 108f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "b 108f\n"
+ "107:" // Height 5: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "108:" // Height 5: Partial direct writeback: Done
+ "b 110f\n"
+ "109:" // Height 5: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "110:" // Height 5: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 91b\n"
+ "b 178f\n"
+ "111:" // Height 6
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 112f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 113f\n"
+ "112:" // Height 6: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "113:" // Height 6: Column loop
+ "cbz x8, 114f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "b 119f\n"
+ "114:" // Height 6: no bias
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x6, #0x4\n"
+ "bge 117f\n"
+ "tbz x6, #1, 115f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 116f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "116:" // Height 6: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 6: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "b 119f\n"
+ "118:" // Height 6: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "119:" // Height 6: setup done
+ "mov x16, #0x0\n"
+ "120:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "cbnz x16, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 122f\n"
+ "121:" // Height 6: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "122:" // Height 6: input setup done
+ "cmp x15, #0x4\n"
+ "blt 125f\n"
+ "cmp x15, #0x8\n"
+ "blt 124f\n"
+ "123:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "bge 123b\n"
+ "124:" // Height 6: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "125:" // Height 6: Multiply loop: Main loop skip
+ "cbz x15, 127f\n"
+ "126:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "cbnz x15, 126b\n"
+ "127:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 128f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "128:" // Height 6: No activation
+ "cmp x6, #0x4\n"
+ "bge 131f\n"
+ "tbz x6, #1, 129f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "tbz x6, #0, 130f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "b 130f\n"
+ "129:" // Height 6: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "130:" // Height 6: Partial direct writeback: Done
+ "b 132f\n"
+ "131:" // Height 6: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "132:" // Height 6: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 113b\n"
+ "b 178f\n"
+ "133:" // Height 7
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 134f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 135f\n"
+ "134:" // Height 7: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "135:" // Height 7: Column loop
+ "cbz x8, 136f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "b 141f\n"
+ "136:" // Height 7: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "cmp x6, #0x4\n"
+ "bge 139f\n"
+ "tbz x6, #1, 137f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "ldr d30, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 138f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "ld1 { v30.s }[2], [x23]\n"
+ "b 138f\n"
+ "137:" // Height 7: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "ldr s30, [x23, #0x0]\n"
+ "138:" // Height 7: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 141f\n"
+ "139:" // Height 7: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "ldr q30, [x23, #0x0]\n"
+ "b 141f\n"
+ "140:" // Height 7: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "141:" // Height 7: setup done
+ "mov x16, #0x0\n"
+ "142:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x16, 144f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 7: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "144:" // Height 7: input setup done
+ "cmp x15, #0x4\n"
+ "blt 147f\n"
+ "cmp x15, #0x8\n"
+ "blt 146f\n"
+ "145:" // Height 7: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "bge 145b\n"
+ "146:" // Height 7: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v30.4s, v13.4s, v6.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v30.4s, v14.4s, v6.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "fmla v30.4s, v15.4s, v6.s[3]\n"
+ "147:" // Height 7: Multiply loop: Main loop skip
+ "cbz x15, 149f\n"
+ "148:" // Height 7: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "fmla v30.4s, v16.4s, v6.s[0]\n"
+ "cbnz x15, 148b\n"
+ "149:" // Height 7: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 142b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "150:" // Height 7: No activation
+ "cmp x6, #0x4\n"
+ "bge 153f\n"
+ "tbz x6, #1, 151f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "str d30, [x23], #0x8\n"
+ "tbz x6, #0, 152f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x23]\n"
+ "b 152f\n"
+ "151:" // Height 7: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "str s30, [x23, #0x0]\n"
+ "152:" // Height 7: Partial direct writeback: Done
+ "b 154f\n"
+ "153:" // Height 7: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "str q30, [x23, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "154:" // Height 7: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 135b\n"
+ "b 178f\n"
+ "155:" // Height 8
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 156f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "ldr x21, [%x[output_ptr], #0x38]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 157f\n"
+ "156:" // Height 8: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "157:" // Height 8: Column loop
+ "cbz x8, 158f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ "b 163f\n"
+ "158:" // Height 8: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "cmp x6, #0x4\n"
+ "bge 161f\n"
+ "tbz x6, #1, 159f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "ldr d30, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 160f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "ld1 { v30.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 160f\n"
+ "159:" // Height 8: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "ldr s30, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "160:" // Height 8: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 163f\n"
+ "161:" // Height 8: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "ldr q30, [x23, #0x0]\n"
+ "ldr q31, [x21, #0x0]\n"
+ "b 163f\n"
+ "162:" // Height 8: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "163:" // Height 8: setup done
+ "mov x16, #0x0\n"
+ "164:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x16, 166f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 166f\n"
+ "165:" // Height 8: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "166:" // Height 8: input setup done
+ "cmp x15, #0x4\n"
+ "blt 169f\n"
+ "cmp x15, #0x8\n"
+ "blt 168f\n"
+ "167:" // Height 8: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v8.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v31.4s, v9.4s, v7.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v31.4s, v10.4s, v7.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "bge 167b\n"
+ "168:" // Height 8: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v12.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v30.4s, v13.4s, v6.s[1]\n"
+ "fmla v31.4s, v13.4s, v7.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v30.4s, v14.4s, v6.s[2]\n"
+ "fmla v31.4s, v14.4s, v7.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "fmla v30.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "169:" // Height 8: Multiply loop: Main loop skip
+ "cbz x15, 171f\n"
+ "170:" // Height 8: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr s7, [x20], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "fmla v30.4s, v16.4s, v6.s[0]\n"
+ "fmla v31.4s, v16.4s, v7.s[0]\n"
+ "cbnz x15, 170b\n"
+ "171:" // Height 8: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 164b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 172f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v17.4s\n"
+ "172:" // Height 8: No activation
+ "cmp x6, #0x4\n"
+ "bge 175f\n"
+ "tbz x6, #1, 173f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "str d30, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x6, #0, 174f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 174f\n"
+ "173:" // Height 8: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "str s30, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "174:" // Height 8: Partial direct writeback: Done
+ "b 176f\n"
+ "175:" // Height 8: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "str q30, [x23, #0x0]\n"
+ "str q31, [x21, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "176:" // Height 8: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 157b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 178f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 177f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "177:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "178:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index a23101a7ce..4bb7a1e0eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,38 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_hybrid_s8qa_dot_4x16( ARGLIST );
-class hybrid_s8s32_dot_16x4
+class cls_a64_hybrid_s8qa_dot_4x16
{
public:
typedef int8_t operand_type;
- typedef int32_t result_type;
+ typedef int8_t result_type;
- typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+ kern_type kernel=a64_hybrid_s8qa_dot_4x16;
- hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_s8s32_dot_16x4_a55;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..3fb365bc1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_dot_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 94f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 63f\n"
+ "beq 32f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 12f\n"
+ "cmp x27, #0x20\n"
+ "blt 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x11, #0x80]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 11f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "12:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 19f\n"
+ "cmp x27, #0x4\n"
+ "blt 15f\n"
+ "13:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "tbnz %x[flags], #31, 14f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "14:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "bge 13b\n"
+ "cbz x27, 19f\n"
+ "15:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 16f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 17f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "b 17f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 18f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "18:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ "19:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 20f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "20:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add x10, x10, #0x40\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 21f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "21:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 30f\n"
+ "tbz x12, #3, 25f\n"
+ "str d16, [x9], #0x8\n"
+ "tbz x12, #2, 23f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "tbz x12, #1, 22f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "b 29f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "b 29f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 24f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "b 29f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "b 29f\n"
+ "25:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 27f\n"
+ "str s16, [x9], #0x4\n"
+ "tbz x12, #1, 26f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "b 29f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "b 29f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 28f\n"
+ "str h16, [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "b 29f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "29:" // Height 1: Partial direct writeback: Done
+ "b 31f\n"
+ "30:" // Height 1: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "31:" // Height 1: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 3b\n"
+ "b 126f\n"
+ "32:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 33f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 43f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 50f\n"
+ "cmp x27, #0x4\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ "bge 44b\n"
+ "cbz x27, 50f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 47f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x27, #0, 48f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 48f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 49f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "49:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ "50:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 36b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 51f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "51:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 52f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "52:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 61f\n"
+ "tbz x12, #3, 56f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x12, #2, 54f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x12, #1, 53f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 60f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 60f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 55f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 60f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 60f\n"
+ "56:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 58f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x12, #1, 57f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 60f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 60f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 59f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 60f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "60:" // Height 2: Partial direct writeback: Done
+ "b 62f\n"
+ "61:" // Height 2: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "62:" // Height 2: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 34b\n"
+ "b 126f\n"
+ "63:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 64f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 65f\n"
+ "64:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "65:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "66:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "67:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 68f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 69f\n"
+ "68:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "69:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 74f\n"
+ "cmp x27, #0x20\n"
+ "blt 72f\n"
+ "70:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 71f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "71:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 70b\n"
+ "72:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "74:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 81f\n"
+ "cmp x27, #0x4\n"
+ "blt 77f\n"
+ "75:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 76f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "76:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ "bge 75b\n"
+ "cbz x27, 81f\n"
+ "77:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 78f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x27, #0, 79f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 80f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "80:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
+ "81:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 67b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 82f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "82:" // Height 3: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "83:" // Height 3: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 92f\n"
+ "tbz x12, #3, 87f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x12, #2, 85f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x12, #1, 84f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 91f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 91f\n"
+ "85:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 86f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 91f\n"
+ "86:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 91f\n"
+ "87:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 89f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x12, #1, 88f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 91f\n"
+ "88:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 91f\n"
+ "89:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 90f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "91:" // Height 3: Partial direct writeback: Done
+ "b 93f\n"
+ "92:" // Height 3: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "93:" // Height 3: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 65b\n"
+ "b 126f\n"
+ "94:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 95f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "96:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "97:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "98:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 100f\n"
+ "99:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "100:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 105f\n"
+ "cmp x27, #0x20\n"
+ "blt 103f\n"
+ "101:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 102f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "102:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bge 101b\n"
+ "103:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "105:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 112f\n"
+ "cmp x27, #0x4\n"
+ "blt 108f\n"
+ "106:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 107f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
+ "bge 106b\n"
+ "cbz x27, 112f\n"
+ "108:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 109f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x27, #0, 110f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 110f\n"
+ "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "110:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 111f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "111:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n"
+ "112:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 113f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "113:" // Height 4: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 114f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "114:" // Height 4: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 123f\n"
+ "tbz x12, #3, 118f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x12, #2, 116f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x12, #1, 115f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 117f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 120f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x12, #1, 119f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 121f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "122:" // Height 4: Partial direct writeback: Done
+ "b 124f\n"
+ "123:" // Height 4: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "124:" // Height 4: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 96b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 126f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "125:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "126:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
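For readers following the epilogue above: the Height-4 path folds the negated b_offset row sums (v11..v14) and the column bias into each accumulator, scales with sqrdmulh against per_layer_mul, applies the signed rounding shift taken from per_layer_right_shift via srshl, adds c_offset, clamps to [minval, maxval] and narrows to bytes with uzp1. A rough scalar sketch of that per-lane sequence follows; the helper name and signature are illustrative only (they are not part of arm_gemm), and the saturation of sqrdmulh as well as the optional sign-fixup pass guarded by flags bit 5 are ignored.

    #include <cstdint>
    #include <algorithm>

    // Illustrative only: approximates the requantisation done per 32-bit lane
    // in the assembly epilogue above.  Not part of the arm_gemm interface.
    static inline int8_t requantize_lane(int32_t acc,        // sdot accumulator
                                         int32_t row_sum,    // row sum already scaled by -b_offset
                                         int32_t col_bias,   // per-column bias
                                         int32_t mul,        // per_layer_mul (Q31 multiplier)
                                         int32_t shift,      // signed shift fed to srshl (<= 0 for a right shift)
                                         int32_t c_offset,
                                         int32_t minval, int32_t maxval)
    {
        int32_t x = acc + row_sum + col_bias;                          // 32-bit adds, as in the asm
        int64_t v = (int64_t(x) * mul + (int64_t(1) << 30)) >> 31;     // sqrdmulh (saturation ignored)
        int32_t s = -shift;
        if (s > 0) {
            v = (v + (int64_t(1) << (s - 1))) >> s;                    // srshl: rounding shift right
        }
        v += c_offset;
        v = std::min<int64_t>(std::max<int64_t>(v, minval), maxval);   // smin/smax clamp
        return static_cast<int8_t>(v);                                 // uzp1 narrowing
    }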
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
new file mode 100644
index 0000000000..6d4f3b2efe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8qs_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8qs_dot_6x16
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8qs_dot_6x16;
+
+ cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
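The class above only publishes the blocking geometry: the generated kernel produces 6 rows by 16 output columns per tile and consumes K four int8 values at a time through sdot. As a sanity check on those numbers, a hypothetical helper (not part of the library) could derive the tile counts for an M x N x K problem like this:

    #include <cstddef>

    // Illustrative arithmetic only, based on out_height()=6, out_width()=16
    // and k_unroll()=4 declared above; this is not the arm_gemm dispatch code.
    struct TileCount {
        std::size_t row_blocks;   // passes of up to 6 rows
        std::size_t col_blocks;   // passes of up to 16 output columns
        std::size_t padded_k;     // K rounded up to the 4-byte sdot unroll
    };

    inline TileCount tiles_for(std::size_t M, std::size_t N, std::size_t K) {
        constexpr std::size_t out_height = 6;
        constexpr std::size_t out_width  = 16;
        constexpr std::size_t k_unroll   = 4;
        return { (M + out_height - 1) / out_height,
                 (N + out_width  - 1) / out_width,
                 (K + k_unroll   - 1) / k_unroll * k_unroll };
    }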
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..0e98ab8347
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -0,0 +1,3613 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 141f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 113f\n"
+ "beq 85f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 57f\n"
+ "beq 29f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 10f\n"
+ "cmp x11, #0x20\n"
+ "blt 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 15f\n"
+ "cmp x11, #0x4\n"
+ "blt 12f\n"
+ "11:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 11b\n"
+ "cbz x11, 15f\n"
+ "12:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 13f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 14f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 14f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "15:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "tbz %x[flags], #4, 16f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 17f\n"
+ "16:" // Height 1: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "17:" // Height 1: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 18f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "18:" // Height 1: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "bge 27f\n"
+ "tbz x15, #3, 22f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #2, 20f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x15, #1, 19f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "b 26f\n"
+ "19:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "b 26f\n"
+ "20:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 21f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "b 26f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "b 26f\n"
+ "22:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 24f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x15, #1, 23f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "b 26f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "b 26f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 25f\n"
+ "str h8, [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "b 26f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "26:" // Height 1: Partial direct writeback: Done
+ "b 28f\n"
+ "27:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "28:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 170f\n"
+ "29:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "b 31f\n"
+ "30:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "31:" // Height 2: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "32:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "33:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 35f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 35f\n"
+ "34:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "35:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 38f\n"
+ "cmp x11, #0x20\n"
+ "blt 37f\n"
+ "36:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 36b\n"
+ "37:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "38:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 43f\n"
+ "cmp x11, #0x4\n"
+ "blt 40f\n"
+ "39:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 39b\n"
+ "cbz x11, 43f\n"
+ "40:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 41f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 42f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 42f\n"
+ "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "42:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "43:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 33b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "tbz %x[flags], #4, 44f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 45f\n"
+ "44:" // Height 2: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "45:" // Height 2: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "tbz %x[flags], #5, 46f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "46:" // Height 2: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "bge 55f\n"
+ "tbz x15, #3, 50f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #2, 48f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x15, #1, 47f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "b 54f\n"
+ "47:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "b 54f\n"
+ "48:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 49f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "b 54f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "b 54f\n"
+ "50:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 52f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "tbz x15, #1, 51f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "b 54f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "b 54f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 53f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "b 54f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "54:" // Height 2: Partial direct writeback: Done
+ "b 56f\n"
+ "55:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "56:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 31b\n"
+ "b 170f\n"
+ "57:" // Height 3
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "add x27, x27, x19\n"
+ "b 59f\n"
+ "58:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "59:" // Height 3: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "60:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "61:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 62f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 63f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 63f\n"
+ "62:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "63:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 66f\n"
+ "cmp x11, #0x20\n"
+ "blt 65f\n"
+ "64:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 64b\n"
+ "65:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "66:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 71f\n"
+ "cmp x11, #0x4\n"
+ "blt 68f\n"
+ "67:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 67b\n"
+ "cbz x11, 71f\n"
+ "68:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 69f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 70f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 70f\n"
+ "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "70:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "71:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 61b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "tbz %x[flags], #4, 72f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 73f\n"
+ "72:" // Height 3: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "73:" // Height 3: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 74f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "74:" // Height 3: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 83f\n"
+ "tbz x15, #3, 78f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #2, 76f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x15, #1, 75f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "b 82f\n"
+ "75:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "b 82f\n"
+ "76:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 77f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "b 82f\n"
+ "77:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "b 82f\n"
+ "78:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 80f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "tbz x15, #1, 79f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "b 82f\n"
+ "79:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "b 82f\n"
+ "80:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 81f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "b 82f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "82:" // Height 3: Partial direct writeback: Done
+ "b 84f\n"
+ "83:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "84:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 59b\n"
+ "b 170f\n"
+ "85:" // Height 4
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 86f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "b 87f\n"
+ "86:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "87:" // Height 4: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "89:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 91f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 91f\n"
+ "90:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "91:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 94f\n"
+ "cmp x11, #0x20\n"
+ "blt 93f\n"
+ "92:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 92b\n"
+ "93:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "94:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 99f\n"
+ "cmp x11, #0x4\n"
+ "blt 96f\n"
+ "95:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 95b\n"
+ "cbz x11, 99f\n"
+ "96:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 97f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 98f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 98f\n"
+ "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "98:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "99:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 89b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "tbz %x[flags], #4, 100f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 101f\n"
+ "100:" // Height 4: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "101:" // Height 4: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "tbz %x[flags], #5, 102f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "102:" // Height 4: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 111f\n"
+ "tbz x15, #3, 106f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #2, 104f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x15, #1, 103f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 110f\n"
+ "103:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 110f\n"
+ "104:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 105f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 110f\n"
+ "105:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 110f\n"
+ "106:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 108f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x15, #1, 107f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 110f\n"
+ "107:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 110f\n"
+ "108:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 109f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 110f\n"
+ "109:" // Height 4: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "110:" // Height 4: Partial direct writeback: Done
+ "b 112f\n"
+ "111:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "112:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 87b\n"
+ "b 170f\n"
+ "113:" // Height 5
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 114f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 115f\n"
+ "114:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "115:" // Height 5: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "116:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "117:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 118f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 119f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 119f\n"
+ "118:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "119:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 122f\n"
+ "cmp x11, #0x20\n"
+ "blt 121f\n"
+ "120:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 120b\n"
+ "121:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "122:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 127f\n"
+ "cmp x11, #0x4\n"
+ "blt 124f\n"
+ "123:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 123b\n"
+ "cbz x11, 127f\n"
+ "124:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 125f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 126f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 126f\n"
+ "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "126:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "127:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 117b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "tbz %x[flags], #4, 128f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 129f\n"
+ "128:" // Height 5: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "129:" // Height 5: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 130f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "130:" // Height 5: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x15, #1, 131f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 138f\n"
+ "131:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 138f\n"
+ "132:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 138f\n"
+ "133:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 138f\n"
+ "134:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x15, #1, 135f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 138f\n"
+ "135:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 138f\n"
+ "136:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 138f\n"
+ "137:" // Height 5: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "138:" // Height 5: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "140:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 115b\n"
+ "b 170f\n"
+ "141:" // Height 6
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 143f\n"
+ "142:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "143:" // Height 6: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "144:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "145:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 146f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 147f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 147f\n"
+ "146:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "147:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 150f\n"
+ "cmp x11, #0x20\n"
+ "blt 149f\n"
+ "148:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 148b\n"
+ "149:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "150:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 155f\n"
+ "cmp x11, #0x4\n"
+ "blt 152f\n"
+ "151:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 151b\n"
+ "cbz x11, 155f\n"
+ "152:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 153f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 154f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 154f\n"
+ "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "154:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "155:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 145b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x16, x16, #0x40\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "tbz %x[flags], #4, 156f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 157f\n"
+ "156:" // Height 6: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "157:" // Height 6: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v5.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v6.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v7.4s\n"
+ "tbz %x[flags], #5, 158f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "and v5.16b, v29.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v6.16b, v30.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v29.4s, v29.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v6.4s\n"
+ "sqadd v31.4s, v31.4s, v7.4s\n"
+ "158:" // Height 6: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 167f\n"
+ "tbz x15, #3, 162f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #2, 160f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x15, #1, 159f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 166f\n"
+ "159:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 166f\n"
+ "160:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 161f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 166f\n"
+ "161:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 166f\n"
+ "162:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 164f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x15, #1, 163f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 166f\n"
+ "163:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 166f\n"
+ "164:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 165f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 166f\n"
+ "165:" // Height 6: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "166:" // Height 6: Partial direct writeback: Done
+ "b 168f\n"
+ "167:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "168:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 170f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 169f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "169:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "170:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
deleted file mode 100644
index 4a7cdc59a7..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- int32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
- int32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
deleted file mode 100644
index da39a32690..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- int32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
- int32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
new file mode 100644
index 0000000000..16a6f9213a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_s8s32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8s32_dot_6x16
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_s8s32_dot_6x16;
+
+ cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3257986410
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x15, #0x10\n"
+ "bge 12f\n"
+ "tbz x15, #3, 7f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 5f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x15, #1, 6f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x15, #2, 9f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 8f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x15, #1, 10f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "cmp x11, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 25f\n"
+ "cmp x11, #0x4\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 21b\n"
+ "cbz x11, 25f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 23f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 24f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 24f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "25:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 34f\n"
+ "tbz x15, #3, 29f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 27f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 26f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 33f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 28f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 33f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 30f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 33f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 32f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 37f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "38:" // Height 2: Column loop
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x15, #0x10\n"
+ "bge 47f\n"
+ "tbz x15, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x15, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x15, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x15, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 55f\n"
+ "cmp x11, #0x20\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 60f\n"
+ "cmp x11, #0x4\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 56b\n"
+ "cbz x11, 60f\n"
+ "57:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 58f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 59f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 59f\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "59:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "60:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 69f\n"
+ "tbz x15, #3, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 62f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 61f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 68f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 63f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 68f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 66f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 65f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 68f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 67f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "70:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 38b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "73:" // Height 3: Column loop
+ "tbz %x[flags], #0, 83f\n"
+ "cmp x15, #0x10\n"
+ "bge 82f\n"
+ "tbz x15, #3, 77f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 75f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x15, #1, 76f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x15, #2, 79f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 78f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x15, #1, 80f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 84f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 84f\n"
+ "83:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "84:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "85:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 87f\n"
+ "86:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "87:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 90f\n"
+ "cmp x11, #0x20\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 88b\n"
+ "89:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "90:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 95f\n"
+ "cmp x11, #0x4\n"
+ "blt 92f\n"
+ "91:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 91b\n"
+ "cbz x11, 95f\n"
+ "92:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 93f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 94f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 94f\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "94:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "95:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 85b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "bge 104f\n"
+ "tbz x15, #3, 99f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 97f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 96f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 103f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 98f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 103f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 101f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 100f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 103f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 102f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "105:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 73b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 107f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "108:" // Height 4: Column loop
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x15, #0x10\n"
+ "bge 117f\n"
+ "tbz x15, #3, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 110f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 109f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x15, #1, 111f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x15, #2, 114f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 113f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x15, #1, 115f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 119f\n"
+ "118:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "119:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "120:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 122f\n"
+ "121:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "122:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 125f\n"
+ "cmp x11, #0x20\n"
+ "blt 124f\n"
+ "123:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 123b\n"
+ "124:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "125:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 130f\n"
+ "cmp x11, #0x4\n"
+ "blt 127f\n"
+ "126:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 126b\n"
+ "cbz x11, 130f\n"
+ "127:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 128f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 129f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 129f\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "129:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "130:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 131f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 135f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "140:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 108b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 143f\n"
+ "142:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "143:" // Height 5: Column loop
+ "tbz %x[flags], #0, 153f\n"
+ "cmp x15, #0x10\n"
+ "bge 152f\n"
+ "tbz x15, #3, 147f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 145f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 144f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x15, #1, 146f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x15, #2, 149f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 148f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x15, #1, 150f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 154f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 154f\n"
+ "153:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "154:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "155:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 156f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 157f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 157f\n"
+ "156:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "157:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 160f\n"
+ "cmp x11, #0x20\n"
+ "blt 159f\n"
+ "158:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 158b\n"
+ "159:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "160:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 165f\n"
+ "cmp x11, #0x4\n"
+ "blt 162f\n"
+ "161:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 161b\n"
+ "cbz x11, 165f\n"
+ "162:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 163f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 164f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 164f\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "164:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "165:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 155b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 174f\n"
+ "tbz x15, #3, 169f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 167f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 166f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 173f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 168f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 173f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 171f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 170f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 173f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 172f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "175:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 177f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 178f\n"
+ "177:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "178:" // Height 6: Column loop
+ "tbz %x[flags], #0, 188f\n"
+ "cmp x15, #0x10\n"
+ "bge 187f\n"
+ "tbz x15, #3, 182f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 180f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 179f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x15, #1, 181f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x15, #2, 184f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 183f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x15, #1, 185f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 189f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 189f\n"
+ "188:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "189:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "190:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 191f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 192f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 192f\n"
+ "191:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "192:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 195f\n"
+ "cmp x11, #0x20\n"
+ "blt 194f\n"
+ "193:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 193b\n"
+ "194:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "195:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 200f\n"
+ "cmp x11, #0x4\n"
+ "blt 197f\n"
+ "196:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 196b\n"
+ "cbz x11, 200f\n"
+ "197:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 198f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 199f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 199f\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "199:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "200:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 190b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 209f\n"
+ "tbz x15, #3, 204f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 202f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 201f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 208f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 203f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 208f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 206f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 205f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 208f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 207f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "210:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 178b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index e5a88b4519..5b4a7f3e86 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,38 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_hybrid_u8qa_dot_4x16( ARGLIST );
-class hybrid_u8u32_dot_16x4
+class cls_a64_hybrid_u8qa_dot_4x16
{
public:
typedef uint8_t operand_type;
- typedef uint32_t result_type;
+ typedef uint8_t result_type;
- typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+ kern_type kernel=a64_hybrid_u8qa_dot_4x16;
- hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_u8u32_dot_16x4_a55;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..ff12472063
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_dot_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
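+ // The assembly block below reads these fields relative to args_ptr, presumably
+ // through offsetof(KernelArgs, ...) operands (e.g. %[offsetof_N], %[offsetof_B_ptr]).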
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
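+ // Flag bits consumed by the assembly: bit 2 = indirect output (array of row
+ // pointers), bit 3 = indirect input, bit 5 = apply the extra sign correction
+ // before the rounding shift (set above when c_offset > minval), and bit 31
+ // appears to record that the per-row sums have already been accumulated, so
+ // later column blocks can skip that work.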
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 94f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 63f\n"
+ "beq 32f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 12f\n"
+ "cmp x27, #0x20\n"
+ "blt 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x11, #0x80]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 11f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "12:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 19f\n"
+ "cmp x27, #0x4\n"
+ "blt 15f\n"
+ "13:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "tbnz %x[flags], #31, 14f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "14:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "bge 13b\n"
+ "cbz x27, 19f\n"
+ "15:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 16f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 17f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "b 17f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 18f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "18:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ "19:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 20f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "20:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add x10, x10, #0x40\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 21f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "21:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 30f\n"
+ "tbz x12, #3, 25f\n"
+ "str d16, [x9], #0x8\n"
+ "tbz x12, #2, 23f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "tbz x12, #1, 22f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "b 29f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "b 29f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 24f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "b 29f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "b 29f\n"
+ "25:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 27f\n"
+ "str s16, [x9], #0x4\n"
+ "tbz x12, #1, 26f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "b 29f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "b 29f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 28f\n"
+ "str h16, [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "b 29f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "29:" // Height 1: Partial direct writeback: Done
+ "b 31f\n"
+ "30:" // Height 1: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "31:" // Height 1: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 3b\n"
+ "b 126f\n"
+ "32:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 33f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 43f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 50f\n"
+ "cmp x27, #0x4\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ "bge 44b\n"
+ "cbz x27, 50f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 47f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x27, #0, 48f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 48f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 49f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "49:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ "50:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 36b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 51f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "51:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 52f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "52:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 61f\n"
+ "tbz x12, #3, 56f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x12, #2, 54f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x12, #1, 53f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 60f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 60f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 55f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 60f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 60f\n"
+ "56:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 58f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x12, #1, 57f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 60f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 60f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 59f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 60f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "60:" // Height 2: Partial direct writeback: Done
+ "b 62f\n"
+ "61:" // Height 2: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "62:" // Height 2: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 34b\n"
+ "b 126f\n"
+ "63:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 64f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 65f\n"
+ "64:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "65:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "66:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "67:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 68f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 69f\n"
+ "68:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "69:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 74f\n"
+ "cmp x27, #0x20\n"
+ "blt 72f\n"
+ "70:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 71f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "71:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 70b\n"
+ "72:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "74:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 81f\n"
+ "cmp x27, #0x4\n"
+ "blt 77f\n"
+ "75:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 76f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "76:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ "bge 75b\n"
+ "cbz x27, 81f\n"
+ "77:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 78f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x27, #0, 79f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 80f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "80:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
+ "81:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 67b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 82f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "82:" // Height 3: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "83:" // Height 3: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 92f\n"
+ "tbz x12, #3, 87f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x12, #2, 85f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x12, #1, 84f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 91f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 91f\n"
+ "85:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 86f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 91f\n"
+ "86:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 91f\n"
+ "87:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 89f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x12, #1, 88f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 91f\n"
+ "88:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 91f\n"
+ "89:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 90f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "91:" // Height 3: Partial direct writeback: Done
+ "b 93f\n"
+ "92:" // Height 3: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "93:" // Height 3: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 65b\n"
+ "b 126f\n"
+ "94:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 95f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "96:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "97:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "98:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 100f\n"
+ "99:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "100:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 105f\n"
+ "cmp x27, #0x20\n"
+ "blt 103f\n"
+ "101:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 102f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "102:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bge 101b\n"
+ "103:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "105:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 112f\n"
+ "cmp x27, #0x4\n"
+ "blt 108f\n"
+ "106:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 107f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
+ "bge 106b\n"
+ "cbz x27, 112f\n"
+ "108:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 109f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x27, #0, 110f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 110f\n"
+ "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "110:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 111f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "111:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n"
+ "112:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 113f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "113:" // Height 4: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 114f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "114:" // Height 4: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 123f\n"
+ "tbz x12, #3, 118f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x12, #2, 116f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x12, #1, 115f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 117f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 120f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x12, #1, 119f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 121f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "122:" // Height 4: Partial direct writeback: Done
+ "b 124f\n"
+ "123:" // Height 4: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "124:" // Height 4: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 96b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 126f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "125:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "126:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
deleted file mode 100644
index 735e5fd45a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- uint32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
- uint32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
deleted file mode 100644
index 2e86233a06..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- uint32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
- uint32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
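
Almost every arithmetic line in the kernel removed above (and in its replacement below) is a dot-product instruction emitted as a raw ".inst 0x... // udot ..." word with the mnemonic in the comment. Purely for orientation: each "udot vd.4s, vn.16b, vm.4b[idx]" accumulates four u8*u8 products into every 32-bit lane of vd, so one instruction updates four consecutive output columns of one row for four K values at once. A minimal scalar sketch of that single accumulate, with purely illustrative names (not arm_gemm code), is:

#include <cstdint>

// Reference for one "udot vd.4s, vn.16b, vm.4b[idx]":
// for each of the four 32-bit lanes of vd, add the dot product of four bytes
// of vn with the idx-th group of four bytes of vm. Names are illustrative only.
static inline void udot_lane_ref(uint32_t vd[4], const uint8_t vn[16],
                                 const uint8_t vm[16], int idx)
{
    for (int lane = 0; lane < 4; lane++) {
        uint32_t sum = 0;
        for (int k = 0; k < 4; k++) {
            sum += uint32_t(vn[lane * 4 + k]) * uint32_t(vm[idx * 4 + k]);
        }
        vd[lane] += sum;
    }
}

In the kernel above, vn (v8-v15) holds a 16-byte slice of the packed B panel and vm (v0-v7) holds one row of A, with idx selecting which group of four K-bytes is consumed.
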
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
new file mode 100644
index 0000000000..238c1825f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_u8u32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_dot_6x16
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_u8u32_dot_6x16;
+
+ cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
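
The interface above only encodes the blocking: each kernel call produces a 6-row by 16-column tile of uint32 results, and k_unroll() == 4 reflects the four K-bytes consumed by each UDOT. As orientation only, a scalar loop nest consistent with those numbers, ignoring the accumulate flag, indirect inputs, ragged tiles and the exact packed-B byte order, might look like the sketch below; all names are illustrative and not part of the arm_gemm API.

#include <cstddef>
#include <cstdint>

// Illustrative 6x16 u8 -> u32 tile reference. The real kernel (generic.cpp,
// next file in this diff) additionally handles the accumulate flag, indirect
// A pointers, partial tiles, and a B panel interleaved in groups of four
// K-bytes; here B is treated as a plain K x 16 matrix for clarity.
void hybrid_u8u32_6x16_ref(const uint8_t *A, size_t lda,   // 6 rows of A
                           const uint8_t *B, size_t ldb,   // K x 16, simplified layout
                           uint32_t *C, size_t ldc, size_t K)
{
    for (int row = 0; row < 6; row++) {
        for (int col = 0; col < 16; col++) {
            uint32_t acc = 0;
            for (size_t k = 0; k < K; k++) {
                acc += uint32_t(A[row * lda + k]) * uint32_t(B[k * ldb + col]);
            }
            C[row * ldc + col] = acc;
        }
    }
}
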
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3c8654147a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x15, #0x10\n"
+ "bge 12f\n"
+ "tbz x15, #3, 7f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 5f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x15, #1, 6f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x15, #2, 9f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 8f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x15, #1, 10f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "cmp x11, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 25f\n"
+ "cmp x11, #0x4\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 21b\n"
+ "cbz x11, 25f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 23f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 24f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 24f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "25:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 34f\n"
+ "tbz x15, #3, 29f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 27f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 26f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 33f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 28f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 33f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 30f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 33f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 32f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 37f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "38:" // Height 2: Column loop
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x15, #0x10\n"
+ "bge 47f\n"
+ "tbz x15, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x15, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x15, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x15, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 55f\n"
+ "cmp x11, #0x20\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 60f\n"
+ "cmp x11, #0x4\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 56b\n"
+ "cbz x11, 60f\n"
+ "57:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 58f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 59f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 59f\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "59:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "60:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 69f\n"
+ "tbz x15, #3, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 62f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 61f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 68f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 63f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 68f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 66f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 65f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 68f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 67f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "70:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 38b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "73:" // Height 3: Column loop
+ "tbz %x[flags], #0, 83f\n"
+ "cmp x15, #0x10\n"
+ "bge 82f\n"
+ "tbz x15, #3, 77f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 75f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x15, #1, 76f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x15, #2, 79f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 78f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x15, #1, 80f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 84f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 84f\n"
+ "83:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "84:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "85:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 87f\n"
+ "86:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "87:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 90f\n"
+ "cmp x11, #0x20\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 88b\n"
+ "89:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "90:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 95f\n"
+ "cmp x11, #0x4\n"
+ "blt 92f\n"
+ "91:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 91b\n"
+ "cbz x11, 95f\n"
+ "92:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 93f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 94f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 94f\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "94:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "95:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 85b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "bge 104f\n"
+ "tbz x15, #3, 99f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 97f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 96f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 103f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 98f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 103f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 101f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 100f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 103f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 102f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "105:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 73b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 107f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "108:" // Height 4: Column loop
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x15, #0x10\n"
+ "bge 117f\n"
+ "tbz x15, #3, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 110f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 109f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x15, #1, 111f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x15, #2, 114f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 113f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x15, #1, 115f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 119f\n"
+ "118:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "119:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "120:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 122f\n"
+ "121:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "122:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 125f\n"
+ "cmp x11, #0x20\n"
+ "blt 124f\n"
+ "123:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 123b\n"
+ "124:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "125:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 130f\n"
+ "cmp x11, #0x4\n"
+ "blt 127f\n"
+ "126:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 126b\n"
+ "cbz x11, 130f\n"
+ "127:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 128f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 129f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 129f\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "129:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "130:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 131f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 135f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "140:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 108b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 143f\n"
+ "142:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "143:" // Height 5: Column loop
+ "tbz %x[flags], #0, 153f\n"
+ "cmp x15, #0x10\n"
+ "bge 152f\n"
+ "tbz x15, #3, 147f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 145f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 144f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x15, #1, 146f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x15, #2, 149f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 148f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x15, #1, 150f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 154f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 154f\n"
+ "153:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "154:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "155:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 156f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 157f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 157f\n"
+ "156:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "157:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 160f\n"
+ "cmp x11, #0x20\n"
+ "blt 159f\n"
+ "158:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 158b\n"
+ "159:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "160:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 165f\n"
+ "cmp x11, #0x4\n"
+ "blt 162f\n"
+ "161:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 161b\n"
+ "cbz x11, 165f\n"
+ "162:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 163f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 164f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 164f\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "164:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "165:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 155b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 174f\n"
+ "tbz x15, #3, 169f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 167f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 166f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 173f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 168f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 173f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 171f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 170f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 173f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 172f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "175:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 177f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 178f\n"
+ "177:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "178:" // Height 6: Column loop
+ "tbz %x[flags], #0, 188f\n"
+ "cmp x15, #0x10\n"
+ "bge 187f\n"
+ "tbz x15, #3, 182f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 180f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 179f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x15, #1, 181f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x15, #2, 184f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 183f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x15, #1, 185f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 189f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 189f\n"
+ "188:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "189:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "190:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 191f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 192f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 192f\n"
+ "191:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "192:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 195f\n"
+ "cmp x11, #0x20\n"
+ "blt 194f\n"
+ "193:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 193b\n"
+ "194:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "195:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 200f\n"
+ "cmp x11, #0x4\n"
+ "blt 197f\n"
+ "196:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 196b\n"
+ "cbz x11, 200f\n"
+ "197:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 198f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 199f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 199f\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "199:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "200:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 190b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 209f\n"
+ "tbz x15, #3, 204f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 202f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 201f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 208f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 203f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 208f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 206f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 205f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 208f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 207f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "210:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 178b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
deleted file mode 100644
index 58a51432fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-
-namespace arm_gemm {
-
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
-
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q2, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "movi v14.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr]]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr]]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "4:\n"
- "str q13, [%[c_ptr], #0x40]\n"
- "str q17, [%[c_ptr], #0x50]\n"
- "str q10, [%[c_ptr], #0x60]\n"
- "str q14, [%[c_ptr], #0x70]\n"
- "str q18, [%[c_ptr], #0x80]\n"
- "str q11, [%[c_ptr], #0x90]\n"
- "str q15, [%[c_ptr], #0xa0]\n"
- "str q19, [%[c_ptr], #0xb0]\n"
- "str q20, [%[c_ptr], #0xc0]\n"
- "str q24, [%[c_ptr], #0xd0]\n"
- "str q28, [%[c_ptr], #0xe0]\n"
- "str q21, [%[c_ptr], #0xf0]\n"
- "str q25, [%[c_ptr], #0x100]\n"
- "str q29, [%[c_ptr], #0x110]\n"
- "str q22, [%[c_ptr], #0x120]\n"
- "str q26, [%[c_ptr], #0x130]\n"
- "str q30, [%[c_ptr], #0x140]\n"
- "str q23, [%[c_ptr], #0x150]\n"
- "str q27, [%[c_ptr], #0x160]\n"
- "str q31, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 95fed86c2f..2fea5ad2e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -31,10 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_dot_12x8 {
+class cls_a64_interleaved_bf16fp32_dot_8x12 {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -60,13 +59,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
- kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
+ kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
- interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+ cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::X1) {
- kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
- }
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 7ffae524dc..92149a5579 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 7fac59947e..b2c2407b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_mmla_12x8 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12 {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
- kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
+ kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
- interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 7f0eff29af..c476fcf171 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
@@ -87,13 +87,23 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
"movi v27.4s, #0\n"
"prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
"movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
"prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
"add %[b_ptr], %[b_ptr], #0x40\n"
"cbz %[loops], 1f\n"
@@ -105,19 +115,19 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"subs %[loops], %[loops], #0x1\n"
".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
"ldr q4, [%[b_ptr]]\n"
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
"ldr q5, [%[b_ptr], #0x10]\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
"ldr q6, [%[b_ptr], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 7bfb2291a9..b17b76f170 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_mmla_12x8 {
+class cls_a64_interleaved_s8s32_mmla_8x12 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
- kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
+ kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
- interleaved_s8s32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 7953510aa7..2093e75b8e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index d493517cf1..99dd0be0d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_mmla_12x8 {
+class cls_a64_interleaved_u8u32_mmla_8x12 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
- kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
+ kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
- interleaved_u8u32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index dcd15f0345..568e5d1098 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index 981ce34b49..d77e1b0ac2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -30,13 +30,13 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_x1(const float *, const float *, float *, int, int, int);
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
//
// This describes the characteristics of a family of kernels, in terms of
// the required interleave properties and the output block size.
@@ -44,7 +44,7 @@ void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, in
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class sgemm_12x8 {
+class cls_a64_sgemm_8x12 {
public:
typedef float operand_type;
typedef float result_type;
@@ -83,25 +83,25 @@ public:
}
}
- kern_type kernel=a64_sgemm_asimd_12x8;
+ kern_type kernel=a64_sgemm_asimd_8x12;
- sgemm_12x8(const CPUInfo *ci) {
+ cls_a64_sgemm_8x12(const CPUInfo *ci) {
// Select specific kernel if available
switch(ci->get_cpu_model()) {
case CPUModel::A53:
- kernel = a64_sgemm_asimd_12x8_a53;
+ kernel = a64_sgemm_asimd_8x12_a53;
break;
case CPUModel::A55r0:
- kernel = a64_sgemm_asimd_12x8_a55;
+ kernel = a64_sgemm_asimd_8x12_a55;
break;
case CPUModel::A55r1:
- kernel = a64_sgemm_asimd_12x8_a55r1;
+ kernel = a64_sgemm_asimd_8x12_a55r1;
break;
case CPUModel::X1:
- kernel = a64_sgemm_asimd_12x8_x1;
+ kernel = a64_sgemm_asimd_8x12_x1;
break;
default:
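
The hunk above keeps the "strategy" pattern intact while only renaming identifiers: a class publishes the fixed output-block geometry of a kernel family and holds a kernel function pointer that the constructor may redirect to a micro-architecture-specific variant. The following is a minimal, self-contained sketch of that pattern, not the library's real code; the class, enum and function names are simplified placeholders (the real class is cls_a64_sgemm_8x12 and selection keys off CPUInfo::get_cpu_model()).

// Sketch only: simplified stand-in for the runtime kernel-selection strategy class.
#include <cstdio>

enum class CpuModel { GENERIC, A55r1 };   // placeholder for the real CPUModel enum

// Two kernels sharing one signature; the real ones are hand-written assembly.
static void sgemm_kernel_generic(const float *, const float *, float *, int, int, int) {
    std::puts("generic kernel selected");
}
static void sgemm_kernel_a55r1(const float *, const float *, float *, int, int, int) {
    std::puts("A55r1-tuned kernel selected");
}

class sgemm_8x12_strategy {
public:
    typedef float operand_type;
    typedef float result_type;
    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);

    // Output block characteristics every kernel in the family must share.
    static unsigned int out_height() { return 8; }
    static unsigned int out_width()  { return 12; }

    kern_type kernel = sgemm_kernel_generic;   // default implementation

    explicit sgemm_8x12_strategy(CpuModel model) {
        if (model == CpuModel::A55r1) {        // runtime selection, as in the constructor above
            kernel = sgemm_kernel_a55r1;
        }
    }
};

int main() {
    sgemm_8x12_strategy strat(CpuModel::A55r1);
    strat.kernel(nullptr, nullptr, nullptr, 0, 0, 0);   // would be called with real A/B/C panels
    return 0;
}

The point of the pattern is that callers only depend on the shared geometry and signature; which assembly body actually runs is decided once, at construction time.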
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
index 5532485efb..f4b6e7b70f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
index e9f071f7f4..5f86da8ef3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
index 8a6fbacfad..7709ad1be6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
index 48dc46785e..dc72095a9b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
@@ -39,7 +39,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
index 63fdf4df9f..89f8ac2d6c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
@@ -39,7 +39,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
index 6f31efe6cb..5f7252f019 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
@@ -25,13 +25,15 @@
#ifdef __aarch64__
+
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x6(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_6x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_4x6
+class cls_a64_smallK_hybrid_fp32_mla_6x4
{
public:
typedef float operand_type;
@@ -73,9 +75,9 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6;
+ kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4;
- smallK_hybrid_fp32_mla_4x6(const CPUInfo *)
+ cls_a64_smallK_hybrid_fp32_mla_6x4(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index e2fec6af16..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x6(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
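
For context on the loop-count and stride setup visible in this hunk, here is a small hedged sketch, assuming iceildiv is the usual integer ceiling division: N columns are covered in 4-wide blocks, with the last block peeled off (hence the "- 1"), and the leading dimensions are converted to byte strides for the assembly. Values below are illustrative only.

// Sketch only: how loops_count and the byte strides in the hunk above are derived.
#include <cassert>
#include <cstdio>

static long iceildiv(long a, long b) {
    return (a + b - 1) / b;   // ceil(a / b) for positive integers
}

int main() {
    const int  N   = 10;                              // example column count
    const int  lda = 7;                               // example leading dimension of A
    const long loops_count = iceildiv(N, 4) - 1;      // main-loop iterations; final block handled separately
    const long ldab = lda * sizeof(float);            // byte stride, matching "lda * sizeof(float)" above
    assert(loops_count == 2);
    std::printf("loops=%ld ldab=%ld\n", loops_count, ldab);
    return 0;
}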
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
index e9a094855a..a8e0c24eae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
@@ -25,13 +25,15 @@
#ifdef __aarch64__
+
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_8x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_4x8
+class cls_a64_smallK_hybrid_fp32_mla_8x4
{
public:
typedef float operand_type;
@@ -73,9 +75,9 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
+ kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4;
- smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+ cls_a64_smallK_hybrid_fp32_mla_8x4(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index 11888bce74..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
index fc087b73db..abf0eda008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_4x6
+class cls_a64_smallK_hybrid_s8s32_dot_6x4
{
public:
typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x6;
+ kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4;
- smallK_hybrid_s8s32_dot_4x6(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_s8s32_dot_6x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_s8s32_dot_4x6_a55;
+ kernel = a64_smallK_hybrid_s8s32_dot_6x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
index 2d6d2f064c..a9926602fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v22.d[1], temploadreg2\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
- "ins v22.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v23.d[1], temploadreg3\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
"add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
- "ins v23.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v24.d[1], temploadreg0\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr d21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ "ins v21.d[1], temploadreg1\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
"add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
index 88ad36a27a..9ff39719f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
index 3de708cc68..9f9c2a49db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_4x8
+class cls_a64_smallK_hybrid_s8s32_dot_8x4
{
public:
typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x8;
+ kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4;
- smallK_hybrid_s8s32_dot_4x8(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_s8s32_dot_8x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_s8s32_dot_4x8_a55;
+ kernel = a64_smallK_hybrid_s8s32_dot_8x4_a55;
}
}
};
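
(Editorial note, not part of the patch: the renamed class above illustrates the runtime dispatch pattern these kernels use. The sketch below is a minimal, self-contained approximation under assumed names — `CPUModel`, `CPUInfo`, and the two stand-in kernel functions are placeholders, not the library's actual API — showing how the class defaults to the generic entry point and swaps in the Cortex-A55 tuned variant when the detected CPU model matches.)

    #include <cstdint>

    // Assumed stand-ins for the library's CPU detection types.
    enum class CPUModel { GENERIC, A55r1 };

    struct CPUInfo {
        CPUModel model;
        CPUModel get_cpu_model() const { return model; }
    };

    // Stand-ins for the two kernel entry points renamed in this patch
    // (generic vs. A55-tuned); signatures are simplified for the sketch.
    void kernel_generic(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int) {}
    void kernel_a55    (const int8_t *, int, const int8_t *, int32_t *, int, int, int, int) {}

    class smallK_dispatch_sketch {
    public:
        using kern_type = void (*)(const int8_t *, int, const int8_t *,
                                   int32_t *, int, int, int, int);

        // Default to the generic kernel, as in the hpp above.
        kern_type kernel = kernel_generic;

        explicit smallK_dispatch_sketch(const CPUInfo *ci) {
            if (ci->get_cpu_model() == CPUModel::A55r1) {
                kernel = kernel_a55;  // pick the in-order-core tuned variant
            }
        }
    };

The selection happens once at construction, so the hot GEMM loop pays only an indirect call, not a per-iteration CPU check.
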
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
index 7135f2eee6..aba6e0d100 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q26, [c_ptr2]\n"
- "movi v26.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "movi v26.4s, #0\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ins v22.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr d23, [%[b_ptr0], #0x70]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ins v23.d[1], temploadreg3\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
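
The hunks above and below repeat the same restructuring in every specialisation of the smallK hybrid kernel: the "cbz %[loops], 6f" test is hoisted ahead of the accumulator "movi" sequence, label 6 becomes a self-contained path for the case where the loop count (iceildiv(N, 4) - 1 in the surrounding function) is zero and branches to a new shared store label 9, and the now-dead "add c_ptrN, c_ptrN, #0x10" increments after the final stores are dropped. The sketch below is an editor's illustration only — plain scalar C++ with invented names (accumulate_block, store_block, smallk_tail), not the kernel's NEON code — showing the control flow the patched assembly produces.

#include <cstdint>
#include <vector>

static void accumulate_block(const int8_t *&b, int32_t *acc, int rows, int k) {
    // Stand-in for one pass of sdot instructions against the current B panel.
    for (int r = 0; r < rows; r++) {
        for (int i = 0; i < k; i++) {
            acc[r] += b[i];
        }
    }
    b += k; // mirrors the "add %[b_ptr0], %[b_ptr0], #0x.." panel advance
}

static void store_block(std::vector<int32_t *> &c, const int32_t *acc, int rows) {
    // Stand-in for the "str qN, [c_ptrN]" / "add c_ptrN, c_ptrN, #0x10" pairs.
    for (int r = 0; r < rows; r++) {
        *c[r] = acc[r];
        c[r] += 4;
    }
}

void smallk_tail(const int8_t *b, std::vector<int32_t *> c, long loops, int rows, int k) {
    std::vector<int32_t> acc(rows, 0);
    if (loops == 0) {                              // "cbz %[loops], 6f"
        accumulate_block(b, acc.data(), rows, k);  // label 6: one block only,
        for (int r = 0; r < rows; r++) {           // then "b 9f" to the shared
            *c[r] = acc[r];                        // store; the pointers are
        }                                          // not advanced afterwards
        return;
    }
    accumulate_block(b, acc.data(), rows, k);      // first block, computed up front
    while (--loops) {                              // label 8: store the previous
        store_block(c, acc.data(), rows);          // results, re-zero, reload B
        acc.assign(rows, 0);                       // and compute the next block
        accumulate_block(b, acc.data(), rows, k);
    }
    store_block(c, acc.data(), rows);              // label 7: penultimate store
    acc.assign(rows, 0);
    accumulate_block(b, acc.data(), rows, k);
    for (int r = 0; r < rows; r++) {               // label 9: final store with
        *c[r] = acc[r];                            // no trailing pointer bump
    }
}

The visible effect in the diff is twofold: a block with no full loop iterations no longer runs the loop-entry preamble (accumulator zeroing and the extra B loads it used to perform before testing %[loops]), and the output pointers are no longer advanced after their last store, since nothing reads them again before the asm block ends.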
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
index c94e975754..7fcf853d2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q25, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
index 76931db4dd..5d48a52d42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_4x6
+class cls_a64_smallK_hybrid_u8u32_dot_6x4
{
public:
typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x6;
+ kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4;
- smallK_hybrid_u8u32_dot_4x6(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_u8u32_dot_6x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_u8u32_dot_4x6_a55;
+ kernel = a64_smallK_hybrid_u8u32_dot_6x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
index 02894d8327..dddf4c5aa2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v22.d[1], temploadreg2\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
- "ins v22.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v23.d[1], temploadreg3\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
"add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
- "ins v23.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v24.d[1], temploadreg0\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr d21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ "ins v21.d[1], temploadreg1\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
"add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
index fe69f744e2..10bd16aa59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
index d91416c3be..942f94b0bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_4x8
+class cls_a64_smallK_hybrid_u8u32_dot_8x4
{
public:
typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x8;
+ kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4;
- smallK_hybrid_u8u32_dot_4x8(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_u8u32_dot_8x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_u8u32_dot_4x8_a55;
+ kernel = a64_smallK_hybrid_u8u32_dot_8x4_a55;
}
}
};
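
The header diff above shows the renamed kernel class selecting an A55r1-tuned entry point at construction time. A minimal, self-contained C++ sketch of that constructor-time dispatch pattern follows; the enum, CPUInfo struct and kernel bodies here are simplified stand-ins for illustration only and are not the library's actual declarations.

#include <cstdint>
#include <cstdio>

// Simplified stand-ins for the library's CPU-detection types.
enum class CPUModel { GENERIC, A55r1 };

struct CPUInfo {
    CPUModel model;
    CPUModel get_cpu_model() const { return model; }
};

// Stand-ins for the generic and A55-tuned kernel entry points
// (the real ones take the full A/B/C pointer and size argument list).
void kernel_generic(const uint8_t *, int, const uint8_t *, uint32_t *, int) {
    std::puts("generic kernel");
}
void kernel_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int) {
    std::puts("A55r1-tuned kernel");
}

class KernelClass {
public:
    using kern_type = void (*)(const uint8_t *, int, const uint8_t *, uint32_t *, int);

    // Default to the generic kernel; the constructor swaps in the
    // A55 variant when the supplied CPUInfo reports a Cortex-A55r1.
    kern_type kernel = kernel_generic;

    explicit KernelClass(const CPUInfo *ci) {
        if (ci->get_cpu_model() == CPUModel::A55r1) {
            kernel = kernel_a55;
        }
    }
};

int main() {
    CPUInfo a55{CPUModel::A55r1};
    KernelClass k(&a55);
    k.kernel(nullptr, 0, nullptr, nullptr, 0);  // dispatches to the A55-tuned variant
    return 0;
}
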
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
index e70fb6955e..fcb546f51e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q26, [c_ptr2]\n"
- "movi v26.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "movi v26.4s, #0\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ins v22.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr d23, [%[b_ptr0], #0x70]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ins v23.d[1], temploadreg3\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
index 2a7dd3d88d..aeea051662 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q25, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
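
For readers decoding the hunks above: each encoded ".inst 0x6f??e??? // udot vD.4s, vN.16b, vM.4b[i]" line is a UDOT-by-element instruction, which accumulates into every 32-bit lane of vD the dot product of four unsigned bytes of vN with the four bytes of lane i of vM. A minimal scalar sketch of that per-lane operation, purely illustrative and not part of this patch (the helper name and array layout are assumptions for clarity):

#include <cstdint>

// Scalar model of one "udot vD.4s, vN.16b, vM.4b[i]" step:
// for each of the four 32-bit accumulator lanes, add the dot product of
// four unsigned bytes of 'b' (the full 16-byte operand) with the four
// bytes selected by 'lane' from 'a' (the broadcast operand).
static void udot_by_element(uint32_t acc[4], const uint8_t b[16],
                            const uint8_t a[16], int lane)
{
    for (int l = 0; l < 4; l++) {
        uint32_t sum = 0;
        for (int k = 0; k < 4; k++) {
            sum += static_cast<uint32_t>(b[4 * l + k]) *
                   static_cast<uint32_t>(a[4 * lane + k]);
        }
        acc[l] += sum;
    }
}
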
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
index 1bc8021e76..57fd9c909e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
@@ -23,34 +23,28 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#ifdef __aarch64__
+#include "../performance_parameters.hpp"
#include "../std_transforms_sve.hpp"
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-class hybrid_fp32_mla_4VLx4
+class cls_sve_gemv_fp32_mla_8VL
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
+ typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
static unsigned int out_width()
{
- return get_vector_length<float>() * 4;
+ return 8 * get_vector_length<float>();
}
static constexpr unsigned int k_unroll()
@@ -60,7 +54,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -73,17 +67,16 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 1, 8, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
+ kern_type kernel=sve_gemv_fp32_mla_8VL;
- hybrid_fp32_mla_4VLx4(const CPUInfo *)
+ cls_sve_gemv_fp32_mla_8VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // __aarch64__
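
The renamed header above declares a plain GEMV kernel: one row of A (length K) multiplied by a pretransposed B panel, producing up to eight vector-lengths of output per column block, with optional bias and activation clamping and no accumulation into existing output (supports_accumulate() returns false). A scalar reference of the computation follows as a sketch only: the row-major indexing of B and the name gemv_ref are illustrative assumptions, not the library's actual panel layout or API.

#include <algorithm>
#include <cstddef>

// Illustrative reference: out[n] = clamp(bias[n] + sum_k A[k] * B[k][n]).
// The real kernel reads B in pretransposed panels of 8 SVE vectors per k
// and applies the min/max clamp only when the activation flag is set.
static void gemv_ref(const float *A, const float *B, float *out,
                     size_t N, size_t K, const float *bias,
                     float minval, float maxval)
{
    for (size_t n = 0; n < N; n++) {
        float acc = bias ? bias[n] : 0.0f;
        for (size_t k = 0; k < K; k++) {
            acc += A[k] * B[k * N + n];
        }
        out[n] = std::min(std::max(acc, minval), maxval);
    }
}
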
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
new file mode 100644
index 0000000000..c62e31936c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_gemv_fp32_mla_8VL (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cntw x24\n"
+ "add x23, %x[N], x24\n"
+ "sub x23, x23, #0x1\n"
+ "udiv x23, x23, x24\n"
+ "mov x22, %x[bias]\n"
+ "1:" // Column loop
+ "cmp x23, #0x8\n"
+ "bge 50f\n"
+ "cmp x23, #0x6\n"
+ "bgt 43f\n"
+ "beq 36f\n"
+ "cmp x23, #0x4\n"
+ "bgt 29f\n"
+ "beq 22f\n"
+ "cmp x23, #0x2\n"
+ "bgt 15f\n"
+ "beq 8f\n"
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "whilelt p1.s, XZR, %x[N]\n"
+ "cbz x22, 2f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ "b 3f\n"
+ "2:" // Width 1: no bias
+ "mov z24.b, #0x0\n"
+ "3:" // Width 1: setup done
+ "cmp x21, #0x4\n"
+ "ble 5f\n"
+ "4:" // Width 1: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z2.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 4b\n"
+ "5:" // Width 1: Multiply loop: Single iteration only
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z5.s, z0.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z6.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "6:" // Width 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 7f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "7:" // Width 1: No activation
+ "st1w { z24.s }, p1, [%x[output_ptr]]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #1\n"
+ "b 57f\n"
+ "8:" // Width 2
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "sub x19, %x[N], x24\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 9f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "b 10f\n"
+ "9:" // Width 2: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "10:" // Width 2: setup done
+ "cmp x21, #0x4\n"
+ "ble 12f\n"
+ "11:" // Width 2: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[1]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z4.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z5.s, z0.s[2]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z6.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 11b\n"
+ "12:" // Width 2: Multiply loop: Single iteration only
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z9.s, z0.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z10.s, z0.s[0]\n"
+ "subs x21, x21, #0x1\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z12.s, z0.s[1]\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z14.s, z0.s[2]\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "13:" // Width 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 14f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "14:" // Width 2: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #2\n"
+ "b 57f\n"
+ "15:" // Width 3
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x2\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 16f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "b 17f\n"
+ "16:" // Width 3: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "17:" // Width 3: setup done
+ "cmp x21, #0x4\n"
+ "ble 19f\n"
+ "18:" // Width 3: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "cmp x21, #0x4\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[1]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z5.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z6.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[2]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z8.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z9.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z10.s, z0.s[3]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z11.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z12.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 18b\n"
+ "19:" // Width 3: Multiply loop: Single iteration only
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z13.s, z0.s[0]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z14.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z15.s, z0.s[0]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z17.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z18.s, z0.s[1]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z19.s, z0.s[2]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z20.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z21.s, z0.s[2]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z22.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z23.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z1.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "20:" // Width 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 21f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "21:" // Width 3: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #3\n"
+ "b 57f\n"
+ "22:" // Width 4
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x3\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 23f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "b 24f\n"
+ "23:" // Width 4: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "24:" // Width 4: setup done
+ "cmp x21, #0x4\n"
+ "ble 26f\n"
+ "25:" // Width 4: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z5.s, z0.s[1]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z6.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z7.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z9.s, z0.s[2]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z10.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z11.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z12.s, z0.s[2]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z15.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 25b\n"
+ "26:" // Width 4: Multiply loop: Single iteration only
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z17.s, z0.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z18.s, z0.s[0]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z19.s, z0.s[0]\n"
+ "fmla z27.s, z20.s, z0.s[0]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z21.s, z0.s[1]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z22.s, z0.s[1]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z23.s, z0.s[1]\n"
+ "fmla z27.s, z1.s, z0.s[1]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z2.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z3.s, z0.s[2]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z4.s, z0.s[2]\n"
+ "fmla z27.s, z5.s, z0.s[2]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z7.s, z0.s[3]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z9.s, z0.s[3]\n"
+ "27:" // Width 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 28f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "28:" // Width 4: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #4\n"
+ "b 57f\n"
+ "29:" // Width 5
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x4\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 30f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "addvl x22, x22, #5\n"
+ "b 31f\n"
+ "30:" // Width 5: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "31:" // Width 5: setup done
+ "cmp x21, #0x4\n"
+ "ble 33f\n"
+ "32:" // Width 5: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z6.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z7.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z8.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z27.s, z9.s, z0.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z12.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z13.s, z0.s[2]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z14.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z15.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z17.s, z0.s[3]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z19.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z20.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 32b\n"
+ "33:" // Width 5: Multiply loop: Single iteration only
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z21.s, z0.s[0]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z22.s, z0.s[0]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z23.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z1.s, z0.s[0]\n"
+ "fmla z28.s, z2.s, z0.s[0]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[1]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z4.s, z0.s[1]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z5.s, z0.s[1]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z6.s, z0.s[1]\n"
+ "fmla z28.s, z7.s, z0.s[1]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[2]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[2]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z11.s, z0.s[2]\n"
+ "fmla z28.s, z12.s, z0.s[2]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z17.s, z0.s[3]\n"
+ "34:" // Width 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "35:" // Width 5: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #5\n"
+ "b 57f\n"
+ "36:" // Width 6
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x5\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 37f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "b 38f\n"
+ "37:" // Width 6: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "38:" // Width 6: setup done
+ "cmp x21, #0x4\n"
+ "ble 40f\n"
+ "39:" // Width 6: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z10.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z11.s, z0.s[1]\n"
+ "fmla z29.s, z12.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z16.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z17.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z19.s, z0.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z20.s, z0.s[3]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z21.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z22.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z1.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 39b\n"
+ "40:" // Width 6: Multiply loop: Single iteration only
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z5.s, z0.s[0]\n"
+ "fmla z28.s, z6.s, z0.s[0]\n"
+ "fmla z29.s, z7.s, z0.s[0]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z11.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z12.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z0.s[1]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z16.s, z0.s[2]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z17.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z19.s, z0.s[2]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z21.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z22.s, z0.s[3]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z23.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z1.s, z0.s[3]\n"
+ "fmla z29.s, z2.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "41:" // Width 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 42f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "42:" // Width 6: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #6\n"
+ "b 57f\n"
+ "43:" // Width 7
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x6\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 44f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+ "addvl x22, x22, #7\n"
+ "b 45f\n"
+ "44:" // Width 7: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "45:" // Width 7: setup done
+ "cmp x21, #0x4\n"
+ "ble 47f\n"
+ "46:" // Width 7: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "fmla z30.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z11.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z12.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z13.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z30.s, z14.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z16.s, z0.s[2]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z17.s, z0.s[2]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z18.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z19.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z29.s, z20.s, z0.s[2]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z30.s, z21.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z22.s, z0.s[3]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z23.s, z0.s[3]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z1.s, z0.s[3]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z2.s, z0.s[3]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z3.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z29.s, z4.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z30.s, z5.s, z0.s[3]\n"
+ "bgt 46b\n"
+ "47:" // Width 7: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z6.s, z0.s[0]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z7.s, z0.s[0]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z8.s, z0.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z27.s, z9.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z10.s, z0.s[0]\n"
+ "fmla z29.s, z11.s, z0.s[0]\n"
+ "fmla z30.s, z12.s, z0.s[0]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z16.s, z0.s[1]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z18.s, z0.s[1]\n"
+ "fmla z30.s, z19.s, z0.s[1]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z21.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z22.s, z0.s[2]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z23.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z1.s, z0.s[2]\n"
+ "fmla z29.s, z2.s, z0.s[2]\n"
+ "fmla z30.s, z3.s, z0.s[2]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[3]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z5.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z6.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z7.s, z0.s[3]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z9.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z10.s, z0.s[3]\n"
+ "48:" // Width 7: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 49f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "49:" // Width 7: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+ "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #7\n"
+ "b 57f\n"
+ "50:" // Width 8
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x7\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 51f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "b 52f\n"
+ "51:" // Width 8: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "52:" // Width 8: setup done
+ "cmp x21, #0x4\n"
+ "ble 54f\n"
+ "53:" // Width 8: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z30.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z8.s, z0.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z10.s, z0.s[1]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z11.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z12.s, z0.s[1]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z14.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z15.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z16.s, z0.s[1]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z17.s, z0.s[2]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z18.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z20.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z21.s, z0.s[2]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z22.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z23.s, z0.s[2]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z1.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z2.s, z0.s[3]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z3.s, z0.s[3]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z4.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z5.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z6.s, z0.s[3]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z7.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z31.s, z9.s, z0.s[3]\n"
+ "bgt 53b\n"
+ "54:" // Width 8: Multiply loop: Single iteration only
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z10.s, z0.s[0]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z11.s, z0.s[0]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z12.s, z0.s[0]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z27.s, z13.s, z0.s[0]\n"
+ "fmla z28.s, z14.s, z0.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z15.s, z0.s[0]\n"
+ "fmla z30.s, z16.s, z0.s[0]\n"
+ "fmla z31.s, z17.s, z0.s[0]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z19.s, z0.s[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z21.s, z0.s[1]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z22.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z23.s, z0.s[1]\n"
+ "fmla z30.s, z1.s, z0.s[1]\n"
+ "fmla z31.s, z2.s, z0.s[1]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[2]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z4.s, z0.s[2]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z5.s, z0.s[2]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z6.s, z0.s[2]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z7.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z8.s, z0.s[2]\n"
+ "fmla z30.s, z9.s, z0.s[2]\n"
+ "fmla z31.s, z10.s, z0.s[2]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z12.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z13.s, z0.s[3]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z14.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z17.s, z0.s[3]\n"
+ "fmla z31.s, z18.s, z0.s[3]\n"
+ "55:" // Width 8: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 56f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmin z31.s, p2/M, z31.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "56:" // Width 8: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "subs x23, x23, #0x8\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "sub %x[N], %x[N], x24, LSL #3\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+ "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
+ "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #8\n"
+ "bgt 1b\n"
+ "57:" // Exit
+
+ : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+ : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+ : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 385a16fe10..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2247 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 1) / 2) * 2;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 1) / 2;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index eba98bb74d..e344d82dc6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,42 +10,49 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
-class hybrid_bf16fp32_dot_4VLx4
+class cls_sve_hybrid_bf16fp32_dot_6x4VL
{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6;
}
static unsigned int out_width()
@@ -63,27 +70,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
- hybrid_bf16fp32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..19385e56ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -0,0 +1,2237 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_bf16fp32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmin z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z0.s\n"
+ "fmin z30.s, p5/M, z30.s, z0.s\n"
+ "fmin z31.s, p5/M, z31.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z1.s\n"
+ "fmax z30.s, p5/M, z30.s, z1.s\n"
+ "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
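
The bfdot blocks in the added Height-6 path above keep the 6-row by 4-vector fp32 accumulator tile in z8..z31, broadcast the six A rows into z0..z5 with ld1rqh, and stream the B panel through z6/z7; the fmin/fmax sequence before the "No activation" label clamps the result when the activation flag bit is set. As a rough mental model only (it ignores the instruction's exact intermediate rounding behaviour, and bf16_to_f32/bfdot_lane are names introduced here for illustration, not part of the library), one indexed BFDOT lane update behaves like the scalar sketch below.

#include <cstdint>
#include <cstring>

// Widen a bfloat16 (raw 16-bit pattern) to float: bf16 occupies the top
// 16 bits of an IEEE-754 binary32 value.
static inline float bf16_to_f32(uint16_t h) {
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar model of one BFDOT lane: acc += a[0]*b[0] + a[1]*b[1], with each
// product widened to fp32; the indexed form in the kernel picks which
// 2-element group of the replicated A vector supplies a[0..1].
static inline float bfdot_lane(float acc, const uint16_t a[2], const uint16_t b[2]) {
    return acc + bf16_to_f32(a[0]) * bf16_to_f32(b[0])
               + bf16_to_f32(a[1]) * bf16_to_f32(b[1]);
}
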
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
deleted file mode 100644
index 641e5c12fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_4VLx4
-{
-public:
- typedef bfloat16 operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 2;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4;
-
- hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 76e3546c6f..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[128];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "mov z14.s, #0\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z20.d, z16.d\n"
- "mov z21.d, z17.d\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp1 z5.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z5.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z22.d, z18.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z23.d, z19.d\n"
- "mov z24.d, z16.d\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z5.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.h, #0\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.h, #0\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z7.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "mov z20.d, z16.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z25.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z7.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p6/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp1 z13.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr7]\n"
- "mov z20.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.d, z18.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z25.d, z17.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z26.d, z18.d\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr7]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "ld1w z14.s, p0/z, [c_ptr7]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p7/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr7, #-0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p7/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr7, #0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "addvl a_ptr7, a_ptr7, #2\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p6/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p6/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "addvl a_ptr7, a_ptr7, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p0, [c_ptr7]\n"
- "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
- "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
deleted file mode 100644
index bd457e9d27..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_6VLx2
-{
-public:
- typedef bfloat16 operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 3;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 2, 6, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2;
-
- hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
deleted file mode 100644
index 59dc6dc540..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
+++ /dev/null
@@ -1,1633 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[192];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z18.s, #0\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "trn2 z4.d, z2.d, z3.d\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "uzp1 z1.s, z22.s, z23.s\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "uzp1 z2.s, z24.s, z25.s\n"
- "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z26.d, z20.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z27.d, z21.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "mov z28.d, z22.d\n"
- "mov z29.d, z23.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z25.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "ld1w z17.s, p0/z, [c_ptr2]\n"
- "mov z18.s, #0\n"
- "zip1 z26.s, z17.s, z18.s\n"
- "zip2 z27.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z28.s, z17.s, z18.s\n"
- "zip2 z29.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z30.s, z17.s, z18.s\n"
- "zip2 z31.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "fmax z26.s, p7/m, z26.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z27.s, p7/m, z27.s, z18.s\n"
- "fmax z28.s, p7/m, z28.s, z18.s\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z19.s\n"
- "fmin z27.s, p7/m, z27.s, z19.s\n"
- "fmin z28.s, p7/m, z28.s, z19.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "uzp1 z6.s, z26.s, z27.s\n"
- "fmax z29.s, p7/m, z29.s, z18.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z30.s, p7/m, z30.s, z18.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "fmax z31.s, p7/m, z31.s, z18.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z19.s\n"
- "fmin z30.s, p7/m, z30.s, z19.s\n"
- "fmin z31.s, p7/m, z31.s, z19.s\n"
- "st1w z6.s, p0, [c_ptr2]\n"
- "uzp1 z7.s, z28.s, z29.s\n"
- "uzp1 z8.s, z30.s, z31.s\n"
- "st1w z7.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z8.s, p2, [c_ptr2, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z20.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z21.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z28.d, z22.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "mov z29.d, z23.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z25.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "ld1w z17.s, p0/z, [c_ptr2]\n"
- "ld1w z18.s, p0/z, [c_ptr3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z26.s, z17.s, z18.s\n"
- "zip2 z27.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z18.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z28.s, z17.s, z18.s\n"
- "zip2 z29.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "zip1 z30.s, z17.s, z18.s\n"
- "zip2 z31.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "fmax z26.s, p7/m, z26.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z27.s, p7/m, z27.s, z18.s\n"
- "fmax z28.s, p7/m, z28.s, z18.s\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z19.s\n"
- "fmin z27.s, p7/m, z27.s, z19.s\n"
- "fmin z28.s, p7/m, z28.s, z19.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "uzp1 z6.s, z26.s, z27.s\n"
- "uzp2 z7.s, z26.s, z27.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z29.s, p7/m, z29.s, z18.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "fmax z30.s, p7/m, z30.s, z18.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z31.s, p7/m, z31.s, z18.s\n"
- "fmin z29.s, p7/m, z29.s, z19.s\n"
- "fmin z30.s, p7/m, z30.s, z19.s\n"
- "st1w z6.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z19.s\n"
- "uzp1 z8.s, z28.s, z29.s\n"
- "uzp2 z9.s, z28.s, z29.s\n"
- "st1w z7.s, p0, [c_ptr3]\n"
- "uzp1 z10.s, z30.s, z31.s\n"
- "uzp2 z11.s, z30.s, z31.s\n"
- "st1w z8.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z9.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z11.s, p2, [c_ptr3, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
deleted file mode 100644
index f25f7473cb..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_8VLx2
-{
-public:
- typedef bfloat16 operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 4;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 2, 8, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
-
- hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
deleted file mode 100644
index f38a2ea2e3..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ /dev/null
@@ -1,2001 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z1.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "uzp1 z2.s, z20.s, z21.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "uzp1 z3.s, z22.s, z23.s\n"
- "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.d, z16.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "mov z28.d, z20.d\n"
- "mov z29.d, z21.d\n"
- "mov z30.d, z22.d\n"
- "mov z31.d, z23.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z8.s, p0, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "st1w z9.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z10.s, z28.s, z29.s\n"
- "uzp1 z11.s, z30.s, z31.s\n"
- "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z11.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z28.d, z20.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "mov z29.d, z21.d\n"
- "mov z30.d, z22.d\n"
- "mov z31.d, z23.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr2]\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr3]\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z13.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z14.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z15.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
deleted file mode 100644
index 7610a20ac0..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3778 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- __fp16 nullbias[512];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16)));
- }
- __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
- __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
- const __fp16 * const minptr = &minval;
- const __fp16 * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<__fp16>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const __fp16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(__fp16);
-
- __fp16 *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = leftovers;
- const __fp16 *a_ptr0 = a_ptr0_base;
- const __fp16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(__fp16);
- const __fp16 *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z24.h, p0/z, [c_ptr2]\n"
- "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "fmax z24.h, p7/m, z24.h, z14.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.h, p7/m, z25.h, z14.h\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.h, p7/m, z26.h, z14.h\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "fmin z24.h, p7/m, z24.h, z15.h\n"
- "fmin z25.h, p7/m, z25.h, z15.h\n"
- "fmax z27.h, p7/m, z27.h, z14.h\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.h, p7/m, z26.h, z15.h\n"
- "fmin z27.h, p7/m, z27.h, z15.h\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- "st1h z24.h, p0, [c_ptr2]\n"
- "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
- "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
- "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z24.h, p0/z, [c_ptr2]\n"
- "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1h z28.h, p0/z, [c_ptr3]\n"
- "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z28.h, z12.h, z7.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z29.h, z13.h, z7.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z30.h, z14.h, z7.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "fmla z31.h, z15.h, z7.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z28.h, z12.h, z7.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z29.h, z13.h, z7.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z30.h, z14.h, z7.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "fmla z31.h, z15.h, z7.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "fmax z24.h, p7/m, z24.h, z14.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.h, p7/m, z25.h, z14.h\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.h, p7/m, z26.h, z14.h\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "fmin z24.h, p7/m, z24.h, z15.h\n"
- "fmin z25.h, p7/m, z25.h, z15.h\n"
- "fmax z27.h, p7/m, z27.h, z14.h\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.h, p7/m, z26.h, z15.h\n"
- "fmax z28.h, p7/m, z28.h, z14.h\n"
- "fmax z29.h, p7/m, z29.h, z14.h\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.h, p7/m, z27.h, z15.h\n"
- "fmax z30.h, p7/m, z30.h, z14.h\n"
- "fmin z28.h, p7/m, z28.h, z15.h\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.h, p7/m, z29.h, z15.h\n"
- "fmax z31.h, p7/m, z31.h, z14.h\n"
- "fmin z30.h, p7/m, z30.h, z15.h\n"
- "st1h z24.h, p0, [c_ptr2]\n"
- "fmin z31.h, p7/m, z31.h, z15.h\n"
- "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
- "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
- "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
- "st1h z28.h, p0, [c_ptr3]\n"
- "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
- "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
- "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index ebef413848..0260050f29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,42 +10,48 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<__fp16>, \
+ size_t, size_t, \
+ const __fp16 *, \
+ IndirectOutputArg<__fp16>, \
+ const __fp16 *, Activation, bool
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
-class hybrid_fp16_mla_4VLx4
+class cls_sve_hybrid_fp16_mla_6x4VL
{
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
- typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6;
}
static unsigned int out_width()
@@ -63,27 +69,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+ kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
- hybrid_fp16_mla_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
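The renamed header above moves the kernel to the ARGLIST signature (string lengths plus IndirectInputArg/IndirectOutputArg instead of raw pointer/stride arguments) and raises out_height() from 4 to 6. The new generic.cpp that follows packs the run-time options into a flags word that the assembly tests with tbz; a minimal sketch of that packing under the bit assignments visible in the wrapper below (make_flags is a hypothetical helper, not part of the patch):

// Bits consumed by the tbz tests in the asm body:
//   bit 0 (0x1): accumulate into existing C values
//   bit 1 (0x2): apply the minval/maxval clamp (ReLU / bounded ReLU)
//   bit 2 (0x4): output rows addressed indirectly via a pointer array
//   bit 3 (0x8): input rows addressed indirectly via a pointer array
static inline unsigned long make_flags(bool accumulate, bool clamp,
                                       bool indirect_output, bool indirect_input) {
    unsigned long flags = 0;
    if (accumulate)      flags |= 0x1;
    if (clamp)           flags |= 0x2;
    if (indirect_output) flags |= 0x4;
    if (indirect_input)  flags |= 0x8;
    return flags;
}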
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..b19842b122
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -0,0 +1,3178 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "13:" // Height 1: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "27:" // Height 2: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "add x27, x27, x19, LSL #1\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "41:" // Height 3: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "55:" // Height 4: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23]\n"
+ "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmin z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "fmax z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z0.h\n"
+ "fmin z26.h, p5/M, z26.h, z0.h\n"
+ "fmin z27.h, p5/M, z27.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z1.h\n"
+ "fmax z26.h, p5/M, z26.h, z1.h\n"
+ "fmax z27.h, p5/M, z27.h, z1.h\n"
+ "69:" // Height 5: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z24.h }, p4, [x23]\n"
+ "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "add x21, x23, x19, LSL #1\n"
+ "add %x[output_ptr], x21, x19, LSL #1\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23]\n"
+ "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x21]\n"
+ "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z28.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z29.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z30.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z31.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "fmla z28.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "fmla z29.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z30.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z31.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "fmla z28.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "fmla z29.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z30.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z31.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "fmla z28.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "fmla z29.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z30.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z31.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "fmla z28.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "fmla z29.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z30.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z31.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "fmla z28.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "fmla z29.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z30.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z31.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "fmla z28.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "fmla z29.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z30.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z31.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "fmla z28.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "fmla z29.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z30.h, z6.h, z5.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z31.h, z7.h, z5.h[7]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "fmla z28.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "fmla z29.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z30.h, z6.h, z5.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z31.h, z7.h, z5.h[0]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "fmla z28.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "fmla z29.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z30.h, z6.h, z5.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z31.h, z7.h, z5.h[1]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "fmla z28.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "fmla z29.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z30.h, z6.h, z5.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z31.h, z7.h, z5.h[2]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "fmla z28.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "fmla z29.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z30.h, z6.h, z5.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z31.h, z7.h, z5.h[3]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "fmla z28.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "fmla z29.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z30.h, z6.h, z5.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z31.h, z7.h, z5.h[4]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "fmla z28.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "fmla z29.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z30.h, z6.h, z5.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z31.h, z7.h, z5.h[5]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "fmla z28.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "fmla z29.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z30.h, z6.h, z5.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z31.h, z7.h, z5.h[6]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "fmla z28.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "fmla z29.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z30.h, z6.h, z5.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z31.h, z7.h, z5.h[7]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmin z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "fmax z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z0.h\n"
+ "fmin z26.h, p5/M, z26.h, z0.h\n"
+ "fmin z27.h, p5/M, z27.h, z0.h\n"
+ "fmin z28.h, p5/M, z28.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z1.h\n"
+ "fmax z26.h, p5/M, z26.h, z1.h\n"
+ "fmax z27.h, p5/M, z27.h, z1.h\n"
+ "fmax z28.h, p5/M, z28.h, z1.h\n"
+ "fmin z29.h, p5/M, z29.h, z0.h\n"
+ "fmin z30.h, p5/M, z30.h, z0.h\n"
+ "fmin z31.h, p5/M, z31.h, z0.h\n"
+ "fmax z29.h, p5/M, z29.h, z1.h\n"
+ "fmax z30.h, p5/M, z30.h, z1.h\n"
+ "fmax z31.h, p5/M, z31.h, z1.h\n"
+ "83:" // Height 6: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z24.h }, p4, [x23]\n"
+ "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
deleted file mode 100644
index ce3624340e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2118 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long leftovers = K;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = leftovers;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z28.s, z12.s, z7.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z29.s, z13.s, z7.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z30.s, z14.s, z7.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "fmla z31.s, z15.s, z7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z28.s, z12.s, z7.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z29.s, z13.s, z7.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z30.s, z14.s, z7.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "fmla z31.s, z15.s, z7.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
new file mode 100644
index 0000000000..f0cc70b76e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_fp32_mla_6x4VL
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
+
+ cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
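
Editor's note (not part of the patch): the interface class added above exposes the kernel's blocking parameters, so a caller can work out how many 6-row by 4-vector-wide tiles cover an M x N output. Below is a minimal, self-contained sketch of that arithmetic; it assumes a fixed 256-bit SVE implementation (8 floats per vector) in place of get_vector_length<float>(), and the tile_counts helper is purely illustrative, not arm_gemm API.

// Illustrative only -- not part of this commit. Mirrors out_height()/out_width()
// from cls_sve_hybrid_fp32_mla_6x4VL under an assumed vector length.
#include <cstddef>
#include <cstdio>

struct TileCounts {
    size_t row_blocks;  // ceil(M / out_height)
    size_t col_blocks;  // ceil(N / out_width)
};

// Hypothetical helper: number of 6 x (4*VL) tiles needed to cover an M x N output.
TileCounts tile_counts(size_t M, size_t N, size_t floats_per_vector) {
    const size_t out_height = 6;                      // rows per tile (out_height())
    const size_t out_width  = 4 * floats_per_vector;  // columns per tile (out_width())
    return { (M + out_height - 1) / out_height,
             (N + out_width  - 1) / out_width };
}

int main() {
    // Example: a 1000 x 512 output on a 256-bit SVE machine (8 floats per vector).
    TileCounts t = tile_counts(1000, 512, 8);
    std::printf("row blocks: %zu, column blocks: %zu\n", t.row_blocks, t.col_blocks);
    return 0;
}

The six row cases in the generic.cpp implementation that follows (Height 1 through Height 6 paths) correspond to the partial tiles this arithmetic produces when M is not a multiple of 6.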
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..3a6422abd1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -0,0 +1,2236 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x4\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x4\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z28.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z29.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z30.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z31.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "fmla z28.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "fmla z29.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z30.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z31.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "fmla z28.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "fmla z29.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z30.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z31.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "fmla z28.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "fmla z29.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z30.s, z6.s, z5.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z31.s, z7.s, z5.s[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "fmla z28.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "fmla z29.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z30.s, z6.s, z5.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z31.s, z7.s, z5.s[0]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "fmla z28.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "fmla z29.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z30.s, z6.s, z5.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z31.s, z7.s, z5.s[1]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "fmla z28.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "fmla z29.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z30.s, z6.s, z5.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z31.s, z7.s, z5.s[2]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "fmla z28.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "fmla z29.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z30.s, z6.s, z5.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z31.s, z7.s, z5.s[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmin z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z0.s\n"
+ "fmin z30.s, p5/M, z30.s, z0.s\n"
+ "fmin z31.s, p5/M, z31.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z1.s\n"
+ "fmax z30.s, p5/M, z30.s, z1.s\n"
+ "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index fd416ed2f4..20d9922e93 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
-class hybrid_fp32_mmla_4VLx4
+class cls_sve_hybrid_fp32_mla_8x1VL
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -50,12 +56,12 @@ public:
static unsigned int out_width()
{
- return get_vector_length<float>() * 2;
+ return get_vector_length<float>() * 1;
}
static constexpr unsigned int k_unroll()
{
- return 2;
+ return 1;
}
static constexpr bool supports_accumulate()
@@ -63,27 +69,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp32_mmla_4VLx4;
+ kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
- hybrid_fp32_mmla_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
new file mode 100644
index 0000000000..361e303c7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -0,0 +1,1751 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_8x1VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 99f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 85f\n"
+ "beq 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 4f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z24.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x16, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "cbnz x16, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x15, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 3b\n"
+ "b 114f\n"
+ "15:" // Height 2
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 18f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x16, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x16, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x15, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 17b\n"
+ "b 114f\n"
+ "29:" // Height 3
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "add x11, x11, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 32f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x16, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "cbnz x16, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x15, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 40f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "ble 40f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "ble 40f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 31b\n"
+ "b 114f\n"
+ "43:" // Height 4
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 46f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x16, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "cbnz x16, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x15, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 54f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "ble 54f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "ble 54f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 45b\n"
+ "b 114f\n"
+ "57:" // Height 5
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 60f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x16, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "cbnz x16, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "65:" // Height 5: input setup done
+ "cmp x15, #0x4\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 68f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "ble 68f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "ble 68f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 59b\n"
+ "b 114f\n"
+ "71:" // Height 6
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 74f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x16, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "cbnz x16, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "79:" // Height 6: input setup done
+ "cmp x15, #0x4\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 82f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "ble 82f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "ble 82f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 73b\n"
+ "b 114f\n"
+ "85:" // Height 7
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 86f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 87f\n"
+ "86:" // Height 7: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "87:" // Height 7: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 88f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "b 90f\n"
+ "88:" // Height 7: no bias
+ "tbz %x[flags], #0, 89f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x23]\n"
+ "b 90f\n"
+ "89:" // Height 7: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "90:" // Height 7: setup done
+ "mov x16, #0x0\n"
+ "91:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 92f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x16, 93f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 93f\n"
+ "92:" // Height 7: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "93:" // Height 7: input setup done
+ "cmp x15, #0x4\n"
+ "ble 95f\n"
+ "94:" // Height 7: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "bgt 94b\n"
+ "95:" // Height 7: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z30.s, z12.s, z6.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 96f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "fmla z30.s, z13.s, z6.s[1]\n"
+ "ble 96f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "fmla z30.s, z14.s, z6.s[2]\n"
+ "ble 96f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "fmla z30.s, z15.s, z6.s[3]\n"
+ "96:" // Height 7: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 91b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 97f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "97:" // Height 7: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z30.s }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "98:" // Height 7: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 87b\n"
+ "b 114f\n"
+ "99:" // Height 8
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 100f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "ldr x21, [%x[output_ptr], #0x38]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 101f\n"
+ "100:" // Height 8: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "101:" // Height 8: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 102f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "mov z31.d, z24.d\n"
+ "b 104f\n"
+ "102:" // Height 8: no bias
+ "tbz %x[flags], #0, 103f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x23]\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
+ "b 104f\n"
+ "103:" // Height 8: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "104:" // Height 8: setup done
+ "mov x16, #0x0\n"
+ "105:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 106f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x16, 107f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 107f\n"
+ "106:" // Height 8: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "107:" // Height 8: input setup done
+ "cmp x15, #0x4\n"
+ "ble 109f\n"
+ "108:" // Height 8: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z31.s, z8.s, z7.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z31.s, z9.s, z7.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z31.s, z10.s, z7.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z31.s, z11.s, z7.s[3]\n"
+ "bgt 108b\n"
+ "109:" // Height 8: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z30.s, z12.s, z6.s[0]\n"
+ "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z31.s, z12.s, z7.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 110f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "fmla z30.s, z13.s, z6.s[1]\n"
+ "fmla z31.s, z13.s, z7.s[1]\n"
+ "ble 110f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "fmla z30.s, z14.s, z6.s[2]\n"
+ "fmla z31.s, z14.s, z7.s[2]\n"
+ "ble 110f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "fmla z30.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "110:" // Height 8: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 105b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 111f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmin z31.s, p2/M, z31.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "111:" // Height 8: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z30.s }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "st1w { z31.s }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "112:" // Height 8: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 101b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 114f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 113f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "113:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "114:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
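(Editorial note, not part of the patch: the Height 4 through Height 8 blocks added above all follow the same row-broadcast pattern: one vector-length-wide column block is kept live in z24..z31, one accumulator register per output row, and each inner-loop step issues "fmla zacc, zB, zA[lane]" so that a single loaded B vector is multiplied by a scalar A element broadcast from one of the four lanes fetched by ld1rqw. The sketch below is a minimal, hypothetical C++ reference model of that computation, not the kernel itself; it assumes a plain row-major B purely for illustration, whereas the real kernel consumes a packed/transformed B, vectorises the x loop with whilelt predication, and selects the bias / accumulate / zero start via the flags word.)

// Hypothetical scalar reference for the row-broadcast FMLA pattern above.
// rows <= 8 mirrors the Height-N specialisations; minval/maxval model the
// optional fmin/fmax clamp applied before writeback.
#include <algorithm>
#include <cstddef>

static void hybrid_fp32_mla_reference(const float *const *A, const float *B,
                                      float **C, size_t rows, size_t N, size_t K,
                                      const float *bias, float minval, float maxval,
                                      bool accumulate) {
    for (size_t x = 0; x < N; x++) {            // "Column loop": one output element per step
        for (size_t r = 0; r < rows; r++) {     // one accumulator per row (z24..z31)
            // Start from bias if present, else existing C (accumulate), else zero.
            float acc = bias ? bias[x] : (accumulate ? C[r][x] : 0.0f);
            for (size_t k = 0; k < K; k++) {    // fmla zacc, zB, zA[k & 3] pattern
                acc += B[k * N + x] * A[r][k];  // illustration only: real B is packed
            }
            // Optional activation clamp (fmin with maxval, then fmax with minval).
            C[r][x] = std::max(minval, std::min(acc, maxval));
        }
    }
}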
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 1364585604..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 1) / 2) * 2;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long leftovers = K;
- const long blocks_count = (K + 1) / 2;
- float nullbias[128];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.s, #0\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "mov z14.s, #0\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z20.d, z16.d\n"
- "mov z21.d, z17.d\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.s, #0\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.s, #0\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp1 z5.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z5.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z22.d, z18.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z23.d, z19.d\n"
- "mov z24.d, z16.d\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z5.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.s, #0\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.s, #0\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z7.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "mov z20.d, z16.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z25.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z7.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p6/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp1 z13.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z20.d, z16.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.d, z18.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z25.d, z17.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z26.d, z18.d\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "ld1w z14.s, p0/z, [c_ptr7]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p7/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p7/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "addvl a_ptr7, a_ptr7, #2\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p6/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p6/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "addvl a_ptr7, a_ptr7, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p0, [c_ptr7]\n"
- "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
- "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c500f43fe0..0150ce8fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
-class hybrid_s8s32_dot_4VLx4
+class cls_sve_hybrid_s8qa_dot_4x4VL
{
public:
typedef int8_t operand_type;
- typedef int32_t result_type;
+ typedef int8_t result_type;
- typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
- hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..2b1448bd65
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_dot_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 46f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 31f\n"
+ "beq 16f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "bgt 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 12f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "12:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 13f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x19]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "13:" // Height 1: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 14f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "14:" // Height 1: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "15:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 3b\n"
+ "b 62f\n"
+ "16:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 17f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "18:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "22:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 25f\n"
+ "23:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "tbnz %x[flags], #31, 24f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "24:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "bgt 23b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "ble 26f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "ble 26f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "ble 26f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 27f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "27:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 20b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x19]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "28:" // Height 2: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "tbz %x[flags], #5, 29f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "29:" // Height 2: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "30:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 18b\n"
+ "b 62f\n"
+ "31:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 32f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 33f\n"
+ "32:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "33:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 40f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "tbnz %x[flags], #31, 39f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bgt 38b\n"
+ "40:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z26.s, z8.b, z2.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "ble 41f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z25.s, z4.b, z2.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "sdot z27.s, z6.b, z2.b[1]\n"
+ "ble 41f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z25.s, z8.b, z2.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "ble 41f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z25.s, z5.b, z2.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z26.s, z6.b, z2.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ "41:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 42f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "42:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 43f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x19]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov z12.s, z12.s[0]\n"
+ "saddv d13, p0, z13.s\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "43:" // Height 3: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 44f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "44:" // Height 3: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "45:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 33b\n"
+ "b 62f\n"
+ "46:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 47f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "48:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "49:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "50:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 52f\n"
+ "51:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "52:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 55f\n"
+ "53:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z28.s, z4.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z29.s, z5.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "sdot z30.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "sdot z31.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "sdot z28.s, z8.b, z3.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z3.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "sdot z30.s, z10.b, z3.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "sdot z31.s, z4.b, z3.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "sdot z28.s, z5.b, z3.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z29.s, z6.b, z3.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z30.s, z7.b, z3.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "sdot z28.s, z9.b, z3.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z29.s, z10.b, z3.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z30.s, z4.b, z3.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z31.s, z5.b, z3.b[3]\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z14.s, z3.b, z15.b\n"
+ "54:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 53b\n"
+ "55:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[0]\n"
+ "sdot z29.s, z7.b, z3.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z26.s, z8.b, z2.b[0]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "sdot z31.s, z9.b, z3.b[0]\n"
+ "ble 56f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z10.b, z3.b[1]\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z25.s, z4.b, z2.b[1]\n"
+ "sdot z29.s, z4.b, z3.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z3.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "sdot z27.s, z6.b, z2.b[1]\n"
+ "sdot z31.s, z6.b, z3.b[1]\n"
+ "ble 56f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z7.b, z3.b[2]\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z25.s, z8.b, z2.b[2]\n"
+ "sdot z29.s, z8.b, z3.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z30.s, z9.b, z3.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z31.s, z10.b, z3.b[2]\n"
+ "ble 56f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z4.b, z3.b[3]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z25.s, z5.b, z2.b[3]\n"
+ "sdot z29.s, z5.b, z3.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z26.s, z6.b, z2.b[3]\n"
+ "sdot z30.s, z6.b, z3.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ "sdot z31.s, z7.b, z3.b[3]\n"
+ "56:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 57f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z14.s, z3.b, z15.b\n"
+ "57:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 58f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov x19, #0x4\n"
+ "mov z12.s, z12.s[0]\n"
+ "saddv d13, p0, z13.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "saddv d14, p0, z14.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mov z14.s, z14.s[0]\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "58:" // Height 4: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "tbz %x[flags], #5, 59f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "and z9.d, z28.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "and z10.d, z29.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "and z4.d, z30.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z9.s\n"
+ "sqadd z29.s, z29.s, z10.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "sqadd z31.s, z31.s, z5.s\n"
+ "59:" // Height 4: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "addvl x23, x23, #1\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "60:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 48b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
new file mode 100644
index 0000000000..d8562898aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qs_dot_6x4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int8_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return false;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
+
+ cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..4a4af6356c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qs_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z9.s, z9.s, z1.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "tbz %x[flags], #4, 11f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 12f\n"
+ "11:" // Height 1: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "12:" // Height 1: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "13:" // Height 1: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "17:" // Height 2: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "21:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 19b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "add z9.s, z9.s, z1.s\n"
+ "addvl x16, x16, #4\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "tbz %x[flags], #4, 25f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 26f\n"
+ "25:" // Height 2: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "26:" // Height 2: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "27:" // Height 2: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "add x27, x27, x19\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "31:" // Height 3: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "32:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "33:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 35f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 35f\n"
+ "34:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "35:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 37f\n"
+ "36:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "bgt 36b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 33b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "tbz %x[flags], #4, 39f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 40f\n"
+ "39:" // Height 3: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "40:" // Height 3: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "41:" // Height 3: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "45:" // Height 4: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "46:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "47:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 48f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 49f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 49f\n"
+ "48:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "49:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 51f\n"
+ "50:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "bgt 50b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 47b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "tbz %x[flags], #4, 53f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 54f\n"
+ "53:" // Height 4: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "54:" // Height 4: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "55:" // Height 4: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "59:" // Height 5: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "60:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "61:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 62f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 63f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 63f\n"
+ "62:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "63:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 65f\n"
+ "64:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "bgt 64b\n"
+ "65:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "66:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 61b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "tbz %x[flags], #4, 67f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 68f\n"
+ "67:" // Height 5: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "68:" // Height 5: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 69f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "and z4.d, z24.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "and z5.d, z25.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "69:" // Height 5: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "73:" // Height 6: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "74:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "75:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 76f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 77f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 77f\n"
+ "76:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "77:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 79f\n"
+ "78:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "bgt 78b\n"
+ "79:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "80:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 75b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ "tbz %x[flags], #4, 81f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 82f\n"
+ "81:" // Height 6: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "82:" // Height 6: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n"
+ ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n"
+ ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "and z4.d, z24.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "and z5.d, z25.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "and z4.d, z28.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "and z5.d, z29.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "and z6.d, z30.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "and z7.d, z31.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z29.s, z29.s, z5.s\n"
+ "sqadd z30.s, z30.s, z6.s\n"
+ "sqadd z31.s, z31.s, z7.s\n"
+ "83:" // Height 6: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "addvl x23, x23, #1\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
deleted file mode 100644
index b30b8845a6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(int32_t);
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z18.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z19.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
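(For orientation while reading this removed kernel: each `case` block above and below is an SVE inline-assembly inner loop specialized for how many output rows it handles, and every `sdot` accumulates a 4-way signed 8-bit dot product into a 32-bit lane of the z16-z31 accumulators, optionally seeded from C when `accumulate` is set. The following is a minimal scalar sketch of that arithmetic only, not the removed implementation; the function name `ref_s8s32_dot` and its parameters are illustrative assumptions.)

// Scalar reference of the int8 -> int32 dot-product GEMM the removed
// sdot loops compute: C[m][n] (+)= sum_k A[m][k] * B[k][n].
#include <cstdint>
#include <cstddef>

void ref_s8s32_dot(const int8_t *A, size_t lda,   // M x K, row-major
                   const int8_t *B, size_t ldb,   // K x N, row-major
                   int32_t *C, size_t ldc,        // M x N, row-major
                   size_t M, size_t N, size_t K,
                   bool accumulate) {
    for (size_t m = 0; m < M; m++) {
        for (size_t n = 0; n < N; n++) {
            // Start from the existing C value when accumulating, as the
            // "cbnz %[accumulate], 1f" path above does with ld1w.
            int32_t acc = accumulate ? C[m * ldc + n] : 0;
            for (size_t k = 0; k < K; k++) {
                acc += static_cast<int32_t>(A[m * lda + k]) *
                       static_cast<int32_t>(B[k * ldb + n]);
            }
            C[m * ldc + n] = acc;
        }
    }
}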
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z19.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z20.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z26.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z27.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z20.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z21.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z26.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z28.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov z29.s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov z30.s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z31.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..1aebedb861
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_dot_6x4VL
+{
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<int32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
+
+ cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..cae9bf329f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 6b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "12:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 14f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 15f\n"
+ "14:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 16f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "23:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 18b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "24:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 15b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 26f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "27:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 28f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 30b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "36:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 27b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 38f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 39f\n"
+ "38:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "39:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 40f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "44:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "47:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 42b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "48:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 39b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 50f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "51:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 52f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "56:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "59:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "60:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 51b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 62f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 63f\n"
+ "62:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "63:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 64f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "68:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "71:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 66b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "72:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 63b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index c325e522d7..964f7cc2c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
-class hybrid_u8u32_dot_4VLx4
+class cls_sve_hybrid_u8qa_dot_4x4VL
{
public:
typedef uint8_t operand_type;
- typedef uint32_t result_type;
+ typedef uint8_t result_type;
- typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
- hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..0a6546b78a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_dot_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
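+ // Flag bits consumed by the assembly below, as set up above: bit 2 (0x4) selects the
+ // indirect-output path, bit 3 (0x8) the indirect-input path, and bit 5 (0x20) enables the
+ // extra rounding correction used when qp->c_offset > qp->minval. Bit 31 (0x80000000) is
+ // managed inside the assembly and marks that the row sums for the current row block have
+ // already been accumulated, so later column blocks can skip that work.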
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 46f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 31f\n"
+ "beq 16f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "bgt 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 12f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "12:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 13f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x19]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "13:" // Height 1: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 14f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "14:" // Height 1: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "15:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 3b\n"
+ "b 62f\n"
+ "16:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 17f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "18:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "22:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 25f\n"
+ "23:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "tbnz %x[flags], #31, 24f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "24:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "bgt 23b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "ble 26f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "ble 26f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "ble 26f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 27f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "27:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 20b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x19]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "28:" // Height 2: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "tbz %x[flags], #5, 29f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "29:" // Height 2: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "30:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 18b\n"
+ "b 62f\n"
+ "31:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 32f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 33f\n"
+ "32:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "33:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 40f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "tbnz %x[flags], #31, 39f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bgt 38b\n"
+ "40:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "udot z25.s, z7.b, z2.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z26.s, z8.b, z2.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "ble 41f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z25.s, z4.b, z2.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "udot z27.s, z6.b, z2.b[1]\n"
+ "ble 41f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z25.s, z8.b, z2.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "ble 41f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z25.s, z5.b, z2.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z26.s, z6.b, z2.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ "41:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 42f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "42:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 43f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x19]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov z12.s, z12.s[0]\n"
+ "uaddv d13, p0, z13.s\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "43:" // Height 3: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 44f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "44:" // Height 3: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "45:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 33b\n"
+ "b 62f\n"
+ "46:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 47f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "48:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "49:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "50:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 52f\n"
+ "51:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "52:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 55f\n"
+ "53:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z28.s, z4.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z29.s, z5.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "udot z30.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "udot z31.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "udot z28.s, z8.b, z3.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z3.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "udot z30.s, z10.b, z3.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "udot z31.s, z4.b, z3.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "udot z28.s, z5.b, z3.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z29.s, z6.b, z3.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z30.s, z7.b, z3.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "udot z28.s, z9.b, z3.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z29.s, z10.b, z3.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z30.s, z4.b, z3.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z31.s, z5.b, z3.b[3]\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z14.s, z3.b, z15.b\n"
+ "54:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 53b\n"
+ "55:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z2.b[0]\n"
+ "udot z29.s, z7.b, z3.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z26.s, z8.b, z2.b[0]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "udot z31.s, z9.b, z3.b[0]\n"
+ "ble 56f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z10.b, z3.b[1]\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z25.s, z4.b, z2.b[1]\n"
+ "udot z29.s, z4.b, z3.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z3.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "udot z27.s, z6.b, z2.b[1]\n"
+ "udot z31.s, z6.b, z3.b[1]\n"
+ "ble 56f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z7.b, z3.b[2]\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z25.s, z8.b, z2.b[2]\n"
+ "udot z29.s, z8.b, z3.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z30.s, z9.b, z3.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z31.s, z10.b, z3.b[2]\n"
+ "ble 56f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z4.b, z3.b[3]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z25.s, z5.b, z2.b[3]\n"
+ "udot z29.s, z5.b, z3.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z26.s, z6.b, z2.b[3]\n"
+ "udot z30.s, z6.b, z3.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ "udot z31.s, z7.b, z3.b[3]\n"
+ "56:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 57f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z14.s, z3.b, z15.b\n"
+ "57:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 58f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov x19, #0x4\n"
+ "mov z12.s, z12.s[0]\n"
+ "uaddv d13, p0, z13.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "uaddv d14, p0, z14.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mov z14.s, z14.s[0]\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "58:" // Height 4: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "tbz %x[flags], #5, 59f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "and z9.d, z28.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "and z10.d, z29.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "and z4.d, z30.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z9.s\n"
+ "sqadd z29.s, z29.s, z10.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "sqadd z31.s, z31.s, z5.s\n"
+ "59:" // Height 4: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "addvl x23, x23, #1\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "60:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 48b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 565832e8de..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z18.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z19.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z19.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z20.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z26.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z27.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z20.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z21.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z26.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z28.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov z29.s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov z30.s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z31.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..af9de4a6eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_dot_6x4VL
+{
+public:
+ typedef uint8_t operand_type;
+ typedef uint32_t result_type;
+
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
+
+ cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..fc8ce636dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 6b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "12:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 14f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 15f\n"
+ "14:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 16f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "23:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 18b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "24:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 15b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 26f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "27:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 28f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 30b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "36:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 27b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 38f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 39f\n"
+ "38:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "39:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 40f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "44:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "47:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 42b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "48:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 39b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 50f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "51:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 52f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "56:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "59:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "60:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 51b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 62f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 63f\n"
+ "62:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "63:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 64f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "68:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "udot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "udot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "udot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "udot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "udot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "udot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "udot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "udot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "udot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z30.s, z6.b, z5.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z31.s, z7.b, z5.b[3]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "udot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z30.s, z6.b, z5.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "udot z31.s, z7.b, z5.b[0]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "udot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "udot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z30.s, z6.b, z5.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "udot z31.s, z7.b, z5.b[1]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "udot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "udot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z30.s, z6.b, z5.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z5.b[2]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "udot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "udot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z30.s, z6.b, z5.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z31.s, z7.b, z5.b[3]\n"
+ "71:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 66b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "72:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 63b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
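The Height 2-6 blocks above differ only in how many A rows they keep live in z-registers; the arithmetic each UDOT performs is the same u8 x u8 -> u32 multiply-accumulate, with the "no accumulate" paths zeroing z8..z31 first. As a rough reference only (the function name, row-major layout and loop order below are illustrative assumptions; the kernel itself consumes a packed B panel and groups K in fours for UDOT), the computation is equivalent to:

// Scalar sketch of the u8 -> u32 dot-product accumulation performed by the
// generated SVE kernel above. Illustrative only, not part of the patch.
#include <cstdint>
#include <cstddef>

void u8u32_gemm_ref(const uint8_t *A, const uint8_t *B, uint32_t *C,
                    size_t M, size_t N, size_t K, bool accumulate)
{
    for (size_t m = 0; m < M; m++) {
        for (size_t n = 0; n < N; n++) {
            // accumulate == false corresponds to the "no accumulate" label
            // in each Height block, which zeroes the accumulator registers.
            uint32_t acc = accumulate ? C[m * N + n] : 0;
            for (size_t k = 0; k < K; k++) {
                acc += static_cast<uint32_t>(A[m * K + k]) *
                       static_cast<uint32_t>(B[k * N + n]);
            }
            C[m * N + n] = acc;
        }
    }
}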
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index 43107e45fa..12bb758b68 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_dot_3VLx8 {
+class cls_sve_interleaved_bf16fp32_dot_8x3VL {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
- kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
- interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 7e20ed0971..adee900337 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index f1353e2086..2889dd7f0f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_mmla_3VLx8 {
+class cls_sve_interleaved_bf16fp32_mmla_8x3VL {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
- kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
- interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 16cc69b2a6..e43404e608 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 816c0cd095..eb946d9dfa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-class interleaved_fp16_mla_3VLx8 {
+class cls_sve_interleaved_fp16_mla_8x3VL {
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
- kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
+ kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
- interleaved_fp16_mla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index f2050cbd56..46b8770409 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index cce90fb135..b84ba83b6a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int);
-class interleaved_fp32_mla_3VLx8 {
+class cls_sve_interleaved_fp32_mla_8x3VL {
public:
typedef float operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
- kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
+ kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
- interleaved_fp32_mla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index cd178c478a..1e05a308b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
index 4ca43cd5c9..96216960ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mmla_8x3VL(const float *, const float *, float *, int, int, int);
-class interleaved_fp32_mmla_3VLx8 {
+class cls_sve_interleaved_fp32_mmla_8x3VL {
public:
typedef float operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
- kern_type kernel=sve_interleaved_fp32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_fp32_mmla_8x3VL;
- interleaved_fp32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
index a404ae9c82..39daf0ff20 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index e40ba215b4..3e16915cd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_dot_3VLx8 {
+class cls_sve_interleaved_s8s32_dot_8x3VL {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
- interleaved_s8s32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index cdc70705c5..674c2400bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 361598d594..02b3451c54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_mmla_3VLx8 {
+class cls_sve_interleaved_s8s32_mmla_8x3VL {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
- interleaved_s8s32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index cde9ec32e9..578aa01732 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 252f38ec63..832a224199 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_dot_3VLx8 {
+class cls_sve_interleaved_u8u32_dot_8x3VL {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
- interleaved_u8u32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 6626f8463b..891869c767 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index ed44a9d8fc..4fdaab84bd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_mmla_3VLx8 {
+class cls_sve_interleaved_u8u32_mmla_8x3VL {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
- interleaved_u8u32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index 81a1dbcf51..fa08a9d091 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
index b555066195..2097d76a54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_1VLx8
+class cls_sve_smallK_hybrid_fp32_mla_8x1VL
{
public:
typedef float operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL;
- smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
index 5501688054..e07cfa8218 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<float>()) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
index eef1e4cc65..e50c05ba39 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_1VLx8
+class cls_sve_smallK_hybrid_s8s32_dot_8x1VL
{
public:
typedef int8_t operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL;
- smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
index e2fbdcb61b..5770076d04 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<int32_t>()) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
"mov z27.s, #0\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "cbz %[loops], 2f\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
"st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
"sdot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
"sdot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z28.s, z20.b, z4.b[0]\n"
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
"sdot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
"sdot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
index 70a0b12130..60184be043 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_1VLx8
+class cls_sve_smallK_hybrid_u8u32_dot_8x1VL
{
public:
typedef uint8_t operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL;
- smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
index 1d0b84e788..b980d9b5c2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<uint32_t>()) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
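A minimal, hedged sketch of the setup shown in the context lines above, assuming arm_gemm's iceildiv is plain ceiling division and get_vector_length<uint32_t>() reports how many 32-bit lanes fit in one SVE vector (the helpers below are local stand-ins, not the library's definitions): the counted loop covers all but one 1VL-wide block of N, and the A and C row strides are converted to bytes once up front.

#include <cstdint>
#include <cstdio>

// Local stand-ins for illustration only.
static long iceildiv(long a, long b) { return (a + b - 1) / b; }  // assumed behaviour of arm_gemm's helper
static long sve_u32_lanes() { return 8; }                         // e.g. a 256-bit SVE implementation

int main() {
    int N = 20, lda = 24, ldc = 20;
    long loops_count = iceildiv(N, sve_u32_lanes()) - 1;  // all but the last (possibly partial) 1VL block
    long ldab = lda * (long)sizeof(uint8_t);              // A row stride in bytes
    long ldcb = ldc * (long)sizeof(uint32_t);             // C row stride in bytes
    std::printf("loops=%ld ldab=%ld ldcb=%ld\n", loops_count, ldab, ldcb);  // loops=2 ldab=24 ldcb=80
    return 0;
}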
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
"mov z27.s, #0\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "cbz %[loops], 2f\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
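The hunks above and below all apply the same reshuffle: accumulator zeroing and the b_ptr0 advance move behind the initial "cbz %[loops], 2f" test, the zero-iteration case gets its own copy of the dot-product body at label 2:, and both paths branch to a shared predicated store at label 5: that no longer advances the per-row C pointers. A toy, single-row scalar analogue of that "full blocks, then one predicated tail block" structure follows; the real kernel processes eight A rows at once with SVE sdot/udot, so everything here is illustrative only.

#include <cstdint>
#include <cstdio>

// Toy single-row analogue of the restructured block loop; illustrative only.
static void toy_kernel(const std::int8_t *a, const std::int8_t *b, std::int32_t *c,
                       long loops, int K, int block_cols, int tail_cols)
{
    long col = 0;
    if (loops != 0) {                                       // cbz %[loops], 2f
        for (long l = 0; l < loops; ++l) {                  // labels 4:/3:, full-width blocks
            for (int j = 0; j < block_cols; ++j) {
                std::int32_t acc = 0;                       // mov z24.s, #0
                for (int i = 0; i < K; ++i)
                    acc += a[i] * b[(col + j) * K + i];     // sdot/udot accumulation over K
                c[col + j] = acc;                           // st1w ..., p7 followed by addvl
            }
            col += block_cols;
        }
    }
    for (int j = 0; j < tail_cols; ++j) {                   // labels 2:/5:, predicated tail block
        std::int32_t acc = 0;
        for (int i = 0; i < K; ++i)
            acc += a[i] * b[(col + j) * K + i];
        c[col + j] = acc;                                   // st1w ..., p0, no addvl afterwards
    }
}

int main() {
    const std::int8_t a[4]     = {1, 2, 3, 4};
    const std::int8_t b[3 * 4] = {1,1,1,1, 2,2,2,2, 3,3,3,3};
    std::int32_t      c[3]     = {0, 0, 0};
    toy_kernel(a, b, c, /*loops=*/0, /*K=*/4, /*block_cols=*/8, /*tail_cols=*/3);
    std::printf("%d %d %d\n", c[0], c[1], c[2]);            // prints: 10 20 30
    return 0;
}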
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
"st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"udot z25.s, z19.b, z1.b[3]\n"
"udot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"udot z25.s, z23.b, z1.b[3]\n"
"udot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z28.s, z20.b, z4.b[0]\n"
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"udot z25.s, z19.b, z1.b[3]\n"
"udot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z28.s, z16.b, z4.b[0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"udot z25.s, z23.b, z1.b[3]\n"
"udot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"